diff --git a/.gitattributes b/.gitattributes index cc2d7e7bcc456102ee879502c1c0c1311f4f1098..3908da47dcdf344e365c9d7e9fa7d24895f8dbb3 100644 --- a/.gitattributes +++ b/.gitattributes @@ -34,3 +34,59 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-2900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-3900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-4900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..949a98efd228358c4d1d1ad461114a3ee2d232f8 --- /dev/null +++ b/README.md @@ -0,0 +1,63 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +model_name: telugu +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +licence: license +pipeline_tag: text-generation +--- + +# Model Card for telugu + +This model is a fine-tuned version of [unsloth/gemma-4-E4B-it](https://huggingface.co/unsloth/gemma-4-E4B-it). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/rohithsaimidigudla-omnisynkai/gemma-health-adapters/runs/mwc9jt0z) + + +This model was trained with SFT. + +### Framework versions + +- PEFT 0.19.1 +- TRL: 0.19.1 +- Transformers: 5.5.0 +- Pytorch: 2.7.0+cu128 +- Datasets: 3.6.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..13e7eccf51f5d9d11e1fc349773e81db85eac36b --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:023fcb9c596c99c5e8d74320f9720621834918ec3bcd5d877b44b0fe0907ce2e +size 169741912 diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-100/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-100/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..65f7cd319648121f105b20714d692f24f6414140 --- /dev/null +++ b/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a0d76b0ebb45ec68a37d642d7342c66a7ebc9bc3239f3387972226f24509e56 +size 169741912 diff --git a/checkpoint-100/chat_template.jinja b/checkpoint-100/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-100/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b875bef738c81236b17817d1cf9a749f98fc8bef --- /dev/null +++ b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ebe97922ef0bee5a2887cb2ee8f12595764d517de7176ed003caf71939844df +size 71463733 diff --git a/checkpoint-100/processor_config.json b/checkpoint-100/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-100/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..35089dbf59de1bc505764378fda1dbc247fe0d6b --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfa39a08ca6ca0b25c44556fe7464362808ae67fd00d1432e1130777acac8674 +size 1465 diff --git a/checkpoint-100/tokenizer.json b/checkpoint-100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-100/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d86d0125e0eee9e10ca1d4c1419f560117f3bbab --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,182 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.018195050946142648, + "eval_steps": 100, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.444622973392128e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..bc886230771b1d9c8f306c7a4b9b3c7960936750 --- /dev/null +++ b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:195f79601dec1ad668a414b5c045319cec84f48961f45b7d32762f86750cd8b1 +size 5777 diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-1000/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-1000/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_model.safetensors b/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e6ea49cef3e00abb142b0eeedfbae9c372378b5d --- /dev/null +++ b/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f94c7dd4d79ecdb435c295a616d4707c2bf0e734fbefe7d10ecfa59b195ee625 +size 169741912 diff --git a/checkpoint-1000/chat_template.jinja b/checkpoint-1000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-1000/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5db9f22bf2e772e6511dedec2f5297297df9802b --- /dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:795a63e9a73654a7dd8a4dac66a5a2b305d11f32784400415681ec19ef91f007 +size 72807355 diff --git a/checkpoint-1000/processor_config.json b/checkpoint-1000/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-1000/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5dd592c118762fb5e01051201291cfbb5392dbcd --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:406994c2cf2acc1e48ce8857e7cbb9e95d4fab92a97bbe36f71721705be347d7 +size 1465 diff --git a/checkpoint-1000/tokenizer.json b/checkpoint-1000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-1000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9c3d8304c154fda794f591a45f133637c71e7699 --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,1442 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1819505094614265, + "eval_steps": 100, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.583006871819799e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-1100/README.md b/checkpoint-1100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-1100/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1100/adapter_config.json b/checkpoint-1100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-1100/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1100/adapter_model.safetensors b/checkpoint-1100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cb2647dbe1250de395963cb8f8ad1ab34e3f03be --- /dev/null +++ b/checkpoint-1100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4be3bea2ca3bd38e446c68a30717eb1a31d7d5b77955efe33bf656a8162068a +size 169741912 diff --git a/checkpoint-1100/chat_template.jinja b/checkpoint-1100/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-1100/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1100/optimizer.pt b/checkpoint-1100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f14357495e786417a1152b2baa822b3169f33375 --- /dev/null +++ b/checkpoint-1100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66120ce4d55186cce9be5cdf28e030e89994c81dac5711321d07d2b5ce8153e3 +size 72807355 diff --git a/checkpoint-1100/processor_config.json b/checkpoint-1100/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-1100/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-1100/rng_state.pth b/checkpoint-1100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-1100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-1100/scheduler.pt b/checkpoint-1100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a30411f3b56e549bf9d0e3fcfc041ba0aea0119e --- /dev/null +++ b/checkpoint-1100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:936724e73ecd7ecf26460f7aeb2b5af5460899f93c78695a46fc00c541454d94 +size 1465 diff --git a/checkpoint-1100/tokenizer.json b/checkpoint-1100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-1100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-1100/tokenizer_config.json b/checkpoint-1100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-1100/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-1100/trainer_state.json b/checkpoint-1100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..fe7549052c2122f500081edda24a93eabfecd1ac --- /dev/null +++ b/checkpoint-1100/trainer_state.json @@ -0,0 +1,1582 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.20014556040756915, + "eval_steps": 100, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.127484770153037e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1100/training_args.bin b/checkpoint-1100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-1100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-1200/README.md b/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-1200/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1200/adapter_config.json b/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-1200/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1200/adapter_model.safetensors b/checkpoint-1200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..292d935185c8ac00919d157f9ff64a889d356961 --- /dev/null +++ b/checkpoint-1200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:758b7e5c64f7b3b9a2dfb7f9c3f402266b67013f70427ae941acb07350f0c694 +size 169741912 diff --git a/checkpoint-1200/chat_template.jinja b/checkpoint-1200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-1200/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1200/optimizer.pt b/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f13dac5f0b23147b58f5cda5e2c132b30fb971b --- /dev/null +++ b/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e1e647229ebd58f619f9224174e6d5fab90526935a57bf68b5a5fbc119fb909 +size 72807355 diff --git a/checkpoint-1200/processor_config.json b/checkpoint-1200/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-1200/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-1200/rng_state.pth b/checkpoint-1200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-1200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea3b11a51b0e81378736f4c09e20d923f2a9c07c --- /dev/null +++ b/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efcf962131305188aae5d8c42fb21f39c330e15fc73bc76b4411e357b0d01cee +size 1465 diff --git a/checkpoint-1200/tokenizer.json b/checkpoint-1200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-1200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-1200/tokenizer_config.json b/checkpoint-1200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-1200/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dad094a6a4be8ecdf658f8f1f7edf3308e22b870 --- /dev/null +++ b/checkpoint-1200/trainer_state.json @@ -0,0 +1,1722 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2183406113537118, + "eval_steps": 100, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.681503343752571e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-1300/README.md b/checkpoint-1300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-1300/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1300/adapter_config.json b/checkpoint-1300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-1300/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1300/adapter_model.safetensors b/checkpoint-1300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..07c4583f14fc07208b1f75ebd0101b70546e0f20 --- /dev/null +++ b/checkpoint-1300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21824c6a8f5083ba2221748ebd811b7a9fc6278660a9f4521ad7824fdcbb2c6 +size 169741912 diff --git a/checkpoint-1300/chat_template.jinja b/checkpoint-1300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-1300/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1300/optimizer.pt b/checkpoint-1300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c44d446e2fdb1bf1a1b53cd353392a5325a742cc --- /dev/null +++ b/checkpoint-1300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5175257136617c2ec9a887b2cd454032ebf3146bc0c36e17cbd121d9793eadfc +size 72807355 diff --git a/checkpoint-1300/processor_config.json b/checkpoint-1300/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-1300/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-1300/rng_state.pth b/checkpoint-1300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-1300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-1300/scheduler.pt b/checkpoint-1300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7b8971608fe833cc9e47f0e552ebb0c2d967871 --- /dev/null +++ b/checkpoint-1300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3a92a0fa04e93d4646a22102148247efeafa7bddd78a00b75c6f94617b15fd2 +size 1465 diff --git a/checkpoint-1300/tokenizer.json b/checkpoint-1300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-1300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-1300/tokenizer_config.json b/checkpoint-1300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-1300/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-1300/trainer_state.json b/checkpoint-1300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d6b247df7d42d81cf64caf66e08e88fbff8e386b --- /dev/null +++ b/checkpoint-1300/trainer_state.json @@ -0,0 +1,1862 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.23653566229985443, + "eval_steps": 100, + "global_step": 1300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.228400527632223e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1300/training_args.bin b/checkpoint-1300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-1300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-1400/README.md b/checkpoint-1400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-1400/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1400/adapter_config.json b/checkpoint-1400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-1400/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1400/adapter_model.safetensors b/checkpoint-1400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4f01d4e06af34c6612821e55a7eb012b178477cf --- /dev/null +++ b/checkpoint-1400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cb28cfb399618f61de528c45e4399799c020a48d356ee719d582cf526507da2 +size 169741912 diff --git a/checkpoint-1400/chat_template.jinja b/checkpoint-1400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-1400/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1400/optimizer.pt b/checkpoint-1400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e8f193f386581ef2e412324e5c1e4dfd8be8331 --- /dev/null +++ b/checkpoint-1400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4e89c6629846413da997174e8f2034125889039290a4be48aed11dbe5707be8 +size 72807355 diff --git a/checkpoint-1400/processor_config.json b/checkpoint-1400/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-1400/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-1400/rng_state.pth b/checkpoint-1400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-1400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3a541ea94708ae9b1d6e581389ef1d1fc95c392 --- /dev/null +++ b/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857af1b441f656602733aca456e3a743a89384ac648ee3fd3489a940ab95b523 +size 1465 diff --git a/checkpoint-1400/tokenizer.json b/checkpoint-1400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-1400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-1400/tokenizer_config.json b/checkpoint-1400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-1400/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4c6f04641ac09dbac246ba5e604d79b75a56b651 --- /dev/null +++ b/checkpoint-1400/trainer_state.json @@ -0,0 +1,2002 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2547307132459971, + "eval_steps": 100, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.772071633538204e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1400/training_args.bin b/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-1500/README.md b/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-1500/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1500/adapter_config.json b/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-1500/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1500/adapter_model.safetensors b/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e28903f8dde41ae0f08b4ffd11e175d1d7bbd83c --- /dev/null +++ b/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69d60343b506ad0c943d74eadd22f650e82ab11cb475ba6ad3c72df2549e3b00 +size 169741912 diff --git a/checkpoint-1500/chat_template.jinja b/checkpoint-1500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-1500/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1500/optimizer.pt b/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c11ad1abb065b3e9c4ecd72e817201e14359910e --- /dev/null +++ b/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52868d6f8fcaa503d24389933d16a144ef1ca33b160fb8bb4f5a732b8c520ddf +size 72807355 diff --git a/checkpoint-1500/processor_config.json b/checkpoint-1500/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-1500/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-1500/rng_state.pth b/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e413bcf8f4605605003c34039a363e44b27c2e96 --- /dev/null +++ b/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3af2dbb852cf80ef122cfd6295dff1ecfd87bd99ba018f26d2f07667a9aed01b +size 1465 diff --git a/checkpoint-1500/tokenizer.json b/checkpoint-1500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-1500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-1500/tokenizer_config.json b/checkpoint-1500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-1500/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f25f35a90ae22eae5224df099467d13d296c40bc --- /dev/null +++ b/checkpoint-1500/trainer_state.json @@ -0,0 +1,2142 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.27292576419213976, + "eval_steps": 100, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.315576795670244e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-1600/README.md b/checkpoint-1600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-1600/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1600/adapter_config.json b/checkpoint-1600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-1600/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1600/adapter_model.safetensors b/checkpoint-1600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b02bce2ee44b6a015a92b9e4cf802d8f302974eb --- /dev/null +++ b/checkpoint-1600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d76afc5ff79e9f9418fab256c9529de31a72b27cd40750f03f0fa62717eb9285 +size 169741912 diff --git a/checkpoint-1600/chat_template.jinja b/checkpoint-1600/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-1600/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1600/optimizer.pt b/checkpoint-1600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..64c17bc27169a6c1b414ce1b23f6c4ab528046c1 --- /dev/null +++ b/checkpoint-1600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:044e1ca71e64bd33573cf03130723b7c3498a1381191791bc0ac6a53d0f0169f +size 72807355 diff --git a/checkpoint-1600/processor_config.json b/checkpoint-1600/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-1600/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-1600/rng_state.pth b/checkpoint-1600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-1600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-1600/scheduler.pt b/checkpoint-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8cf0a25e31300fb36150cbf31c849bdb3152739 --- /dev/null +++ b/checkpoint-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3407cdafc385e5f27b4133c663904e78499531174c286753c8d22a6075323095 +size 1465 diff --git a/checkpoint-1600/tokenizer.json b/checkpoint-1600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-1600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-1600/tokenizer_config.json b/checkpoint-1600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-1600/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-1600/trainer_state.json b/checkpoint-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e05cf51f3434ca164e2fea211d7366031b688eca --- /dev/null +++ b/checkpoint-1600/trainer_state.json @@ -0,0 +1,2282 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.29112081513828236, + "eval_steps": 100, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.860485929468659e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1600/training_args.bin b/checkpoint-1600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-1600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-1700/README.md b/checkpoint-1700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-1700/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1700/adapter_config.json b/checkpoint-1700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-1700/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1700/adapter_model.safetensors b/checkpoint-1700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4fabe07009ae70f5b0cb414813e8df5f42ed1a68 --- /dev/null +++ b/checkpoint-1700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc93d16691d0e3cdc725a1d2e6b92ff9081c08a109445bcc51a63ae59ccca77 +size 169741912 diff --git a/checkpoint-1700/chat_template.jinja b/checkpoint-1700/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-1700/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1700/optimizer.pt b/checkpoint-1700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b285e977f72411d65e135e9e447e5fe5e9c72182 --- /dev/null +++ b/checkpoint-1700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e035c7f0b6983e93185d1768718559dd5f3716bce8fa1ef79cff194aa53d17e8 +size 72807355 diff --git a/checkpoint-1700/processor_config.json b/checkpoint-1700/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-1700/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-1700/rng_state.pth b/checkpoint-1700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-1700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-1700/scheduler.pt b/checkpoint-1700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6434a76d4d87812a0731572c67f178ae6865ae7 --- /dev/null +++ b/checkpoint-1700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d443ebce6b9e1fa228706e611e162a7398618a90c3923fa065457abec2fe8fa0 +size 1465 diff --git a/checkpoint-1700/tokenizer.json b/checkpoint-1700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-1700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-1700/tokenizer_config.json b/checkpoint-1700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-1700/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-1700/trainer_state.json b/checkpoint-1700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1f22bbf02fd2298130e2bde7a2667499990b6d6f --- /dev/null +++ b/checkpoint-1700/trainer_state.json @@ -0,0 +1,2422 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.309315866084425, + "eval_steps": 100, + "global_step": 1700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.409120064297925e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1700/training_args.bin b/checkpoint-1700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-1700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-1800/README.md b/checkpoint-1800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-1800/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1800/adapter_config.json b/checkpoint-1800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-1800/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1800/adapter_model.safetensors b/checkpoint-1800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0fa05a14273bda432c5109b78eebe99a26ebb547 --- /dev/null +++ b/checkpoint-1800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37be58b37107e3e073fc6a7989de8b112a42eb3fe45ff7a948df543599fbb4d5 +size 169741912 diff --git a/checkpoint-1800/chat_template.jinja b/checkpoint-1800/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-1800/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1800/optimizer.pt b/checkpoint-1800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b5376b2aff9b0e5fedb49cb8d9ec251350c341c --- /dev/null +++ b/checkpoint-1800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f2e48dd5170d692b732527d1d8d71c793dc18b8a4688a937ac30ddb8a190278 +size 72807355 diff --git a/checkpoint-1800/processor_config.json b/checkpoint-1800/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-1800/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-1800/rng_state.pth b/checkpoint-1800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-1800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-1800/scheduler.pt b/checkpoint-1800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1731e382022e615ee7d7c054b92df874a8e611dc --- /dev/null +++ b/checkpoint-1800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5929f0ded68cadf5a903af70646ae01b592f23d0cf488dad2297141251892d69 +size 1465 diff --git a/checkpoint-1800/tokenizer.json b/checkpoint-1800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-1800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-1800/tokenizer_config.json b/checkpoint-1800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-1800/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-1800/trainer_state.json b/checkpoint-1800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1936a2517897358c32b61d00cbb13aaef1541746 --- /dev/null +++ b/checkpoint-1800/trainer_state.json @@ -0,0 +1,2562 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.32751091703056767, + "eval_steps": 100, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.961482475364106e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1800/training_args.bin b/checkpoint-1800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-1800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-1900/README.md b/checkpoint-1900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-1900/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-1900/adapter_config.json b/checkpoint-1900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-1900/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1900/adapter_model.safetensors b/checkpoint-1900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b87aa3fe7fa719d6a7eec1f610f38bce67e0598b --- /dev/null +++ b/checkpoint-1900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b14ba9786290b8547d2316a4207455e4bf529521963087b4c7b5834a1d5c685 +size 169741912 diff --git a/checkpoint-1900/chat_template.jinja b/checkpoint-1900/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-1900/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-1900/optimizer.pt b/checkpoint-1900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..edbf499a2d140d165acb1b01d5b97b20c8464d13 --- /dev/null +++ b/checkpoint-1900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2292d63b15aac6e7462fa0c28e7be8610fd06b587c5bd1ade3e75c7ee94893fe +size 72807355 diff --git a/checkpoint-1900/processor_config.json b/checkpoint-1900/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-1900/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-1900/rng_state.pth b/checkpoint-1900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-1900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-1900/scheduler.pt b/checkpoint-1900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e10b8078fa966f09bfa866fbaf6f2f4ef4f5b64 --- /dev/null +++ b/checkpoint-1900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1eec1b8c04b4aafb9d7ce16df1301b79becce4e5c3d0741b708efa6172330081 +size 1465 diff --git a/checkpoint-1900/tokenizer.json b/checkpoint-1900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-1900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-1900/tokenizer_config.json b/checkpoint-1900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-1900/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-1900/trainer_state.json b/checkpoint-1900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d0f35e0eabd94d7afe9f044962fde323088a9d14 --- /dev/null +++ b/checkpoint-1900/trainer_state.json @@ -0,0 +1,2702 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3457059679767103, + "eval_steps": 100, + "global_step": 1900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0510934319974177e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1900/training_args.bin b/checkpoint-1900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-1900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-200/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-200/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0dc859761af4bdca3b87b9a6f46266d791b01f4b --- /dev/null +++ b/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f1fb5bfacc31fdb717a86263aba4bc317f49b0e0a622c411c81f51589ab46f +size 169741912 diff --git a/checkpoint-200/chat_template.jinja b/checkpoint-200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-200/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b389c40b520cb3c7eb73d38a7cc1a3480f842e8e --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4b91829c7d08e4d7ec6bef21d1366610edf9b4f5e9f4bf99e1af4fc24452e78 +size 72806843 diff --git a/checkpoint-200/processor_config.json b/checkpoint-200/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-200/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..66e6004136ff395821fc96a23576cb2c57bb9aa8 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29729914fcd289b6cd7b8d01f022641caf784664fb47ca2f2f1100dd2c24307d +size 1465 diff --git a/checkpoint-200/tokenizer.json b/checkpoint-200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a042aff091ff3ba7fcfe0dbd36c3e04b12b96aae --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,322 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.036390101892285295, + "eval_steps": 100, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1939621783048986e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-2000/README.md b/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-2000/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2000/adapter_config.json b/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-2000/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2000/adapter_model.safetensors b/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c97597f7537f1c06b48b6d5221ec8a500bec04fb --- /dev/null +++ b/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e5237ec64de13e87d4dce23b236cab89d80e71d67d7620797822a866a7babc9 +size 169741912 diff --git a/checkpoint-2000/chat_template.jinja b/checkpoint-2000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-2000/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ca8c73b498ae9dc0746f2a44ff42293c83ac5830 --- /dev/null +++ b/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fa24a3e158860666ac3f2fbedcfa62984113fb8045909bf0a62c4d972f2137b +size 72807355 diff --git a/checkpoint-2000/processor_config.json b/checkpoint-2000/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-2000/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-2000/rng_state.pth b/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..731d6355192a1d6d2f4f05e83f27d99381e7ff35 --- /dev/null +++ b/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f5bdc1a9515599586f406cb9dd374c8e2782d4f3f12557a0e6fc81a835534f4 +size 1465 diff --git a/checkpoint-2000/tokenizer.json b/checkpoint-2000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-2000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5f823793b437ad32b488441ca957816107dee2ed --- /dev/null +++ b/checkpoint-2000/trainer_state.json @@ -0,0 +1,2842 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.363901018922853, + "eval_steps": 100, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1054573765554867e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-2100/README.md b/checkpoint-2100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-2100/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2100/adapter_config.json b/checkpoint-2100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-2100/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2100/adapter_model.safetensors b/checkpoint-2100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..89824870012385a7445185a0b242b10503d88ec5 --- /dev/null +++ b/checkpoint-2100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd22d1447908436667437e7214461acb5b3fecd1084ef74d79e006b5bb7bb6fa +size 169741912 diff --git a/checkpoint-2100/chat_template.jinja b/checkpoint-2100/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-2100/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-2100/optimizer.pt b/checkpoint-2100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bfde60777d9c71277f32f6310cda5d2ea5900ed --- /dev/null +++ b/checkpoint-2100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1092c81c46128e4e68ee858b1452a206901141ef5ee9e216e16ac1ba432eb3fa +size 72807355 diff --git a/checkpoint-2100/processor_config.json b/checkpoint-2100/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-2100/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-2100/rng_state.pth b/checkpoint-2100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-2100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-2100/scheduler.pt b/checkpoint-2100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..95f472f2e9300a031d1dd6de2fef1d5198b78098 --- /dev/null +++ b/checkpoint-2100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d9bf6b09354aeeffdfb2ee007f3a492486af8a031630a2b864b38b951b063c1 +size 1465 diff --git a/checkpoint-2100/tokenizer.json b/checkpoint-2100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-2100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-2100/tokenizer_config.json b/checkpoint-2100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-2100/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-2100/trainer_state.json b/checkpoint-2100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b146e2b8814610fd03f0683a40290ec83d81f0c6 --- /dev/null +++ b/checkpoint-2100/trainer_state.json @@ -0,0 +1,2982 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.38209606986899564, + "eval_steps": 100, + "global_step": 2100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1604275617561206e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2100/training_args.bin b/checkpoint-2100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-2100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-2200/README.md b/checkpoint-2200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-2200/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2200/adapter_config.json b/checkpoint-2200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-2200/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2200/adapter_model.safetensors b/checkpoint-2200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2bc9977e54d6c2cffd17bbb6cb4b29a89bc532ed --- /dev/null +++ b/checkpoint-2200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09661f96fafc76014b129e07d70d157fea9859b9d63a2fa6e24ff566fb9fead +size 169741912 diff --git a/checkpoint-2200/chat_template.jinja b/checkpoint-2200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-2200/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-2200/optimizer.pt b/checkpoint-2200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1f333f1be8fdf2569d73e2cf85875c11a8cff165 --- /dev/null +++ b/checkpoint-2200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5008d914373b4eedcadd9daa42d0af31703fd4ddb9c587dbfe773f8990b0a7ec +size 72807355 diff --git a/checkpoint-2200/processor_config.json b/checkpoint-2200/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-2200/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-2200/rng_state.pth b/checkpoint-2200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-2200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-2200/scheduler.pt b/checkpoint-2200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc875bf98e117b4166aa25022db47f1003196ac7 --- /dev/null +++ b/checkpoint-2200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7606597a533127bd5f869473a397899878377d8d2e89fd35dcd20b36560b18 +size 1465 diff --git a/checkpoint-2200/tokenizer.json b/checkpoint-2200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-2200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-2200/tokenizer_config.json b/checkpoint-2200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-2200/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-2200/trainer_state.json b/checkpoint-2200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9acce07e33434b7356e60af62f0035bb79c99c49 --- /dev/null +++ b/checkpoint-2200/trainer_state.json @@ -0,0 +1,3122 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4002911208151383, + "eval_steps": 100, + "global_step": 2200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.215214662937741e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2200/training_args.bin b/checkpoint-2200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-2200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-2300/README.md b/checkpoint-2300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-2300/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2300/adapter_config.json b/checkpoint-2300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-2300/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2300/adapter_model.safetensors b/checkpoint-2300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9cf89c45643ab80f9c5df5c58b97003229ccccb7 --- /dev/null +++ b/checkpoint-2300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e14803de733438d76bcf1a7df87de4d28f4de3fdd96c32248bc74a90cf182a62 +size 169741912 diff --git a/checkpoint-2300/chat_template.jinja b/checkpoint-2300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-2300/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-2300/optimizer.pt b/checkpoint-2300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..029f122f22dcc59a9bcd5b9675be3fed281d17d1 --- /dev/null +++ b/checkpoint-2300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b81919ff41dce57dd507f064720073b68b2521c73bf4075c5bf87bc504864950 +size 72807355 diff --git a/checkpoint-2300/processor_config.json b/checkpoint-2300/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-2300/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-2300/rng_state.pth b/checkpoint-2300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-2300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-2300/scheduler.pt b/checkpoint-2300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a725a52fc0df173be64487a37eea58eb300f6cd --- /dev/null +++ b/checkpoint-2300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5420b6c93e7edddec2910b70fdf481953abc7fb5f197a0f487e43331d572974b +size 1465 diff --git a/checkpoint-2300/tokenizer.json b/checkpoint-2300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-2300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-2300/tokenizer_config.json b/checkpoint-2300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-2300/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-2300/trainer_state.json b/checkpoint-2300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0d82063abdb1fc65a5c80de19cc5bcb352a46a65 --- /dev/null +++ b/checkpoint-2300/trainer_state.json @@ -0,0 +1,3262 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.41848617176128095, + "eval_steps": 100, + "global_step": 2300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2705431556818465e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2300/training_args.bin b/checkpoint-2300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-2300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-2400/README.md b/checkpoint-2400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-2400/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2400/adapter_config.json b/checkpoint-2400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-2400/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2400/adapter_model.safetensors b/checkpoint-2400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ed54a7e89bf39570e891469479025873aef777c8 --- /dev/null +++ b/checkpoint-2400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34f96387bf15e2e5e186883b276e153549328fab909434291857087619cbd064 +size 169741912 diff --git a/checkpoint-2400/chat_template.jinja b/checkpoint-2400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-2400/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-2400/optimizer.pt b/checkpoint-2400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0eaf1a4d12f89e361c04e94ae83a3da8f1934b2 --- /dev/null +++ b/checkpoint-2400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb749e1e610c077e8f9950c8973644a4698f1b479184c93919d17ee94df26ce2 +size 72807355 diff --git a/checkpoint-2400/processor_config.json b/checkpoint-2400/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-2400/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-2400/rng_state.pth b/checkpoint-2400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-2400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-2400/scheduler.pt b/checkpoint-2400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..12bc88601ab56223224a6763ab10f53be2457274 --- /dev/null +++ b/checkpoint-2400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eba492d1342c7608d70619516bfa6a83de7dd0de8ddd7c8fc9b80f417698ce96 +size 1465 diff --git a/checkpoint-2400/tokenizer.json b/checkpoint-2400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-2400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-2400/tokenizer_config.json b/checkpoint-2400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-2400/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-2400/trainer_state.json b/checkpoint-2400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0faf26591fae2e72213066fe77c2a04a380bd1cb --- /dev/null +++ b/checkpoint-2400/trainer_state.json @@ -0,0 +1,3402 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4366812227074236, + "eval_steps": 100, + "global_step": 2400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.325162457139422e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2400/training_args.bin b/checkpoint-2400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-2400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-2500/README.md b/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-2500/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2500/adapter_config.json b/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-2500/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2500/adapter_model.safetensors b/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..39cf0f5764581e39fa89250a439c5a4909fa0a3c --- /dev/null +++ b/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3eef1d2c60b4e9acabb5b2f6950ae95b7f548fc3090dad4828f252e4d574a14 +size 169741912 diff --git a/checkpoint-2500/chat_template.jinja b/checkpoint-2500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-2500/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-2500/optimizer.pt b/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7449ab4934dd9d4922d73666d75f35e90001c1a4 --- /dev/null +++ b/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6d3e13dfdbf076cf85af2f2af9389ebb2acf56cf15cf373ae51af80ea2dab7a +size 72807355 diff --git a/checkpoint-2500/processor_config.json b/checkpoint-2500/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-2500/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-2500/rng_state.pth b/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-2500/scheduler.pt b/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..569a93c33519ee27bc9e3719cfc9b1ab78eeeb19 --- /dev/null +++ b/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e57b03658c31690f8603ab1d1d5ff52fdb21cb504d78cc2e5d0024447a0df7 +size 1465 diff --git a/checkpoint-2500/tokenizer.json b/checkpoint-2500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-2500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-2500/tokenizer_config.json b/checkpoint-2500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-2500/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-2500/trainer_state.json b/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c6a8661d2d1c4cd899d743e50a524509a027062a --- /dev/null +++ b/checkpoint-2500/trainer_state.json @@ -0,0 +1,3542 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.45487627365356625, + "eval_steps": 100, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3799337281583967e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2500/training_args.bin b/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-2600/README.md b/checkpoint-2600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-2600/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2600/adapter_config.json b/checkpoint-2600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-2600/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2600/adapter_model.safetensors b/checkpoint-2600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d0db80ad884feee4cb079c5950b4bdb9ae1ea243 --- /dev/null +++ b/checkpoint-2600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f173453c4a8f79d4c764bf9dfe079b46c3d61ff14c11f0f4434bd53560c47a67 +size 169741912 diff --git a/checkpoint-2600/chat_template.jinja b/checkpoint-2600/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-2600/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-2600/optimizer.pt b/checkpoint-2600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..da55ef4e8f962be5c826a51ba8054c545d28f883 --- /dev/null +++ b/checkpoint-2600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f51cc35a25c1ab7f195b458b4d5a61a4c9a4f62dcbc247421ea1828376819ed7 +size 72807355 diff --git a/checkpoint-2600/processor_config.json b/checkpoint-2600/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-2600/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-2600/rng_state.pth b/checkpoint-2600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-2600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-2600/scheduler.pt b/checkpoint-2600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..07e2637406963f0ce5a57ced3c79ae42c7b38062 --- /dev/null +++ b/checkpoint-2600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaab7036ca9f4229b7c51e6e51d9bf8345f474742af924e02c88c4478ec5f987 +size 1465 diff --git a/checkpoint-2600/tokenizer.json b/checkpoint-2600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-2600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-2600/tokenizer_config.json b/checkpoint-2600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-2600/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-2600/trainer_state.json b/checkpoint-2600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..98e3122b2b9e8c6182f1f7bfa646ab302adeb85b --- /dev/null +++ b/checkpoint-2600/trainer_state.json @@ -0,0 +1,3682 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.47307132459970885, + "eval_steps": 100, + "global_step": 2600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.434562636887095e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2600/training_args.bin b/checkpoint-2600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-2600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-2700/README.md b/checkpoint-2700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-2700/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2700/adapter_config.json b/checkpoint-2700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-2700/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2700/adapter_model.safetensors b/checkpoint-2700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..203992c22dcd71e4ca8c03add6274511415a9c58 --- /dev/null +++ b/checkpoint-2700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5228b3e1f8e018ba523a5a58ecc10a22b9a43c4013b47bcbff970a2bb3659f41 +size 169741912 diff --git a/checkpoint-2700/chat_template.jinja b/checkpoint-2700/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-2700/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-2700/optimizer.pt b/checkpoint-2700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..685f98980bb4ccd9a276b3f51ce543a8f682776d --- /dev/null +++ b/checkpoint-2700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4bd0f7cf5bad80e0365b13b8cffad014af469876163af9316ef3ed54d554656 +size 72807355 diff --git a/checkpoint-2700/processor_config.json b/checkpoint-2700/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-2700/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-2700/rng_state.pth b/checkpoint-2700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-2700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-2700/scheduler.pt b/checkpoint-2700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1368bc7697b897a75c8ff7e6470c07f1fb933def --- /dev/null +++ b/checkpoint-2700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7f25b53574ff484cfa756a6e440f8376a502b769885540ab8e7db003faf2b7a +size 1465 diff --git a/checkpoint-2700/tokenizer.json b/checkpoint-2700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-2700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-2700/tokenizer_config.json b/checkpoint-2700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-2700/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-2700/trainer_state.json b/checkpoint-2700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6e2457148503f2ee163a1aec96a13cf71b4e4d3e --- /dev/null +++ b/checkpoint-2700/trainer_state.json @@ -0,0 +1,3822 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4912663755458515, + "eval_steps": 100, + "global_step": 2700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4891834667741673e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2700/training_args.bin b/checkpoint-2700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-2700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-2800/README.md b/checkpoint-2800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-2800/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2800/adapter_config.json b/checkpoint-2800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-2800/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2800/adapter_model.safetensors b/checkpoint-2800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1b96fadfcb5707bab87751ed99b96dd227e3a365 --- /dev/null +++ b/checkpoint-2800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:975a596b0f9f20e20d0a121966ce95f9f0cc47ac7a3071454c651134ed2521c0 +size 169741912 diff --git a/checkpoint-2800/chat_template.jinja b/checkpoint-2800/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-2800/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-2800/optimizer.pt b/checkpoint-2800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f05633833a462e0fdae1e939af2dc0bd2cfc786 --- /dev/null +++ b/checkpoint-2800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5fafd57d1b93c1a7117bf468c29689eeab4e80ec6d969115ed4cb60c57ca13b +size 72807355 diff --git a/checkpoint-2800/processor_config.json b/checkpoint-2800/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-2800/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-2800/rng_state.pth b/checkpoint-2800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-2800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-2800/scheduler.pt b/checkpoint-2800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..96a4e6609d42076027566db9feeed9130d0f2790 --- /dev/null +++ b/checkpoint-2800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bb4aacf6d545f189a555097c83dcbdc89c9069ccc831393243fe44af2f96596 +size 1465 diff --git a/checkpoint-2800/tokenizer.json b/checkpoint-2800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-2800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-2800/tokenizer_config.json b/checkpoint-2800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-2800/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-2800/trainer_state.json b/checkpoint-2800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d20edad2fc0535646df507ffbcc9642f65dd5fe4 --- /dev/null +++ b/checkpoint-2800/trainer_state.json @@ -0,0 +1,3962 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5094614264919942, + "eval_steps": 100, + "global_step": 2800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.545047564883377e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2800/training_args.bin b/checkpoint-2800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-2800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-2900/README.md b/checkpoint-2900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-2900/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-2900/adapter_config.json b/checkpoint-2900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-2900/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2900/adapter_model.safetensors b/checkpoint-2900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e93ecc7dec5f1db01880fe9ec51835079156f03d --- /dev/null +++ b/checkpoint-2900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:732120270facc7d6d2d69d3ca0c49897f3ba3f1a3559d0b1f470be38475a79b6 +size 169741912 diff --git a/checkpoint-2900/chat_template.jinja b/checkpoint-2900/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-2900/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-2900/optimizer.pt b/checkpoint-2900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe4b6b047f623f02808df43aacb1626fbc0c0293 --- /dev/null +++ b/checkpoint-2900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a918f35832fbacc3db98ebed06ee510ed39bc3d87ee127fa994dbda8cb345bb +size 72807355 diff --git a/checkpoint-2900/processor_config.json b/checkpoint-2900/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-2900/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-2900/rng_state.pth b/checkpoint-2900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-2900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-2900/scheduler.pt b/checkpoint-2900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..88479698beda3260b0434a3c81dac58cc905a738 --- /dev/null +++ b/checkpoint-2900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f55cadcff16b609e6d7d899d6d85d51d6152dd63f70acbdd873e182531f819b0 +size 1465 diff --git a/checkpoint-2900/tokenizer.json b/checkpoint-2900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-2900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-2900/tokenizer_config.json b/checkpoint-2900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-2900/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-2900/trainer_state.json b/checkpoint-2900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d779da3a571fa96ac3973cdf26d413aad71c12c7 --- /dev/null +++ b/checkpoint-2900/trainer_state.json @@ -0,0 +1,4102 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5276564774381368, + "eval_steps": 100, + "global_step": 2900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.600169064604197e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2900/training_args.bin b/checkpoint-2900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-2900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-300/README.md b/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-300/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-300/adapter_config.json b/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-300/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-300/adapter_model.safetensors b/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7eb6dba203b6266f68fdc6f721ff4b83a6823a04 --- /dev/null +++ b/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aacf12521886db4beac302ec42b39ca34d003cd214d61ec1154b5d89fd8ce0e +size 169741912 diff --git a/checkpoint-300/chat_template.jinja b/checkpoint-300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-300/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cc6c15c0f60bd21339f80800867759afb17d443 --- /dev/null +++ b/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dfbfc331e5f36533cbba9406d38c1df79cb1bdb6c5e3c6088e48faec99533e5 +size 72807355 diff --git a/checkpoint-300/processor_config.json b/checkpoint-300/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-300/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4fd9c25e3450ea06b62e8f786fb70859e6136672 --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ce927394996dcd38318599e573f8e5f6def23906b68897e863fdfe657ab241 +size 1465 diff --git a/checkpoint-300/tokenizer.json b/checkpoint-300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-300/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..738d41339059de7cc78d40936a8c94b41b577caf --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,462 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.05458515283842795, + "eval_steps": 100, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7433397849108378e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-3000/README.md b/checkpoint-3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-3000/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3000/adapter_config.json b/checkpoint-3000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-3000/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3000/adapter_model.safetensors b/checkpoint-3000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..91bf604436b870cb0685509b6f0bbcd7618b543d --- /dev/null +++ b/checkpoint-3000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:561130f2a2f932a2beb712641a1a0c8f7402406d273ef7184af992d24f288c70 +size 169741912 diff --git a/checkpoint-3000/chat_template.jinja b/checkpoint-3000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-3000/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-3000/optimizer.pt b/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..40640158123914b30fa2b8006f7d72b13ff779c9 --- /dev/null +++ b/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff954c41539e6202c1b840f66ad70716b5107dd46043d7deb64a6cddabcceb6b +size 72807355 diff --git a/checkpoint-3000/processor_config.json b/checkpoint-3000/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-3000/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-3000/rng_state.pth b/checkpoint-3000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-3000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-3000/scheduler.pt b/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4e10983f12f2ee71fb90f299f3a40de6b072a5f --- /dev/null +++ b/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdd50cc977c9a153c6fd2866e786909af07ca646ecb6892b8cd2d8a1df02834c +size 1465 diff --git a/checkpoint-3000/tokenizer.json b/checkpoint-3000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-3000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-3000/tokenizer_config.json b/checkpoint-3000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-3000/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-3000/trainer_state.json b/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aee470cbfa1d68b89c7eebf2bef32ba796067967 --- /dev/null +++ b/checkpoint-3000/trainer_state.json @@ -0,0 +1,4242 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5458515283842795, + "eval_steps": 100, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6546640174048517e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3000/training_args.bin b/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-3100/README.md b/checkpoint-3100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-3100/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3100/adapter_config.json b/checkpoint-3100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-3100/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3100/adapter_model.safetensors b/checkpoint-3100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..84b92e3c2e5c807881034cfc23fd388b48bcdca4 --- /dev/null +++ b/checkpoint-3100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cf1a639191ca794ea4a0c6cc967246b64d61682f2942d49b2c15f9efa375139 +size 169741912 diff --git a/checkpoint-3100/chat_template.jinja b/checkpoint-3100/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-3100/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-3100/optimizer.pt b/checkpoint-3100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..df513f92578eda8738ddeff751ad325fe7d5b473 --- /dev/null +++ b/checkpoint-3100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5ef6405a1424ab6e05360a6ae89bc28d3c772739dd148801543b8df622246e1 +size 72807355 diff --git a/checkpoint-3100/processor_config.json b/checkpoint-3100/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-3100/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-3100/rng_state.pth b/checkpoint-3100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-3100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-3100/scheduler.pt b/checkpoint-3100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0bfdabc81daa78f9144105e4599edc5cbb241854 --- /dev/null +++ b/checkpoint-3100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d180b1c2dc0dee95fd0c0a838c18e51eb51269d25ce2b5a31707f5876e99f22 +size 1465 diff --git a/checkpoint-3100/tokenizer.json b/checkpoint-3100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-3100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-3100/tokenizer_config.json b/checkpoint-3100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-3100/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-3100/trainer_state.json b/checkpoint-3100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6ad62e3a647bb7ebf5c9d5b2f78f3206e3654d71 --- /dev/null +++ b/checkpoint-3100/trainer_state.json @@ -0,0 +1,4382 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5640465793304221, + "eval_steps": 100, + "global_step": 3100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.70859825492724e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3100/training_args.bin b/checkpoint-3100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-3100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-3200/README.md b/checkpoint-3200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-3200/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3200/adapter_config.json b/checkpoint-3200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-3200/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3200/adapter_model.safetensors b/checkpoint-3200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2c1c591b838821579e7bf8d85202b7d52b68eaf1 --- /dev/null +++ b/checkpoint-3200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32fb177fbe136ca043ed6035a43af0dd484d8fcda80671f1f5f4357be09052d5 +size 169741912 diff --git a/checkpoint-3200/chat_template.jinja b/checkpoint-3200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-3200/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-3200/optimizer.pt b/checkpoint-3200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bcda9c590a157673604ca0d5a969e5d4674132bb --- /dev/null +++ b/checkpoint-3200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c595669c7ffdaea388525bc80a51457a31a22b1735f005c778ba49ce7b80851 +size 72807355 diff --git a/checkpoint-3200/processor_config.json b/checkpoint-3200/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-3200/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-3200/rng_state.pth b/checkpoint-3200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-3200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-3200/scheduler.pt b/checkpoint-3200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1fda2f44100f2f39146503b390bade53e8648468 --- /dev/null +++ b/checkpoint-3200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1485f639f1264de28af0debdb7af8d42b220fc574595895bd6d45b959ed11c9 +size 1465 diff --git a/checkpoint-3200/tokenizer.json b/checkpoint-3200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-3200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-3200/tokenizer_config.json b/checkpoint-3200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-3200/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-3200/trainer_state.json b/checkpoint-3200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..60a0cdcf5851d6fce0b04bde66dbd209ad903699 --- /dev/null +++ b/checkpoint-3200/trainer_state.json @@ -0,0 +1,4522 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5822416302765647, + "eval_steps": 100, + "global_step": 3200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.7637015226676337e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3200/training_args.bin b/checkpoint-3200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-3200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-3300/README.md b/checkpoint-3300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-3300/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3300/adapter_config.json b/checkpoint-3300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-3300/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3300/adapter_model.safetensors b/checkpoint-3300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9511bf5a0688db833bc47fbc607c9fde10e1f8d9 --- /dev/null +++ b/checkpoint-3300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1f146ba2327ee02f804685347ed602693fc59d0277708585446f16fe7f35b6 +size 169741912 diff --git a/checkpoint-3300/chat_template.jinja b/checkpoint-3300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-3300/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-3300/optimizer.pt b/checkpoint-3300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bf5f3f4d2dfa952cd7a0dd3bb57c95fff55d368 --- /dev/null +++ b/checkpoint-3300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc5320188332556e5821be0cd8607d3554c07d98ba5100a705efb88727a70bfd +size 72807355 diff --git a/checkpoint-3300/processor_config.json b/checkpoint-3300/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-3300/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-3300/rng_state.pth b/checkpoint-3300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-3300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-3300/scheduler.pt b/checkpoint-3300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c1d54cfdf5d38e1b5df758ecefaf3ecd3487f9c --- /dev/null +++ b/checkpoint-3300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:061e9564ca1cd5e50b2a0842dab0cf4dbfb6db44af39595b8259d6c65c843209 +size 1465 diff --git a/checkpoint-3300/tokenizer.json b/checkpoint-3300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-3300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-3300/tokenizer_config.json b/checkpoint-3300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-3300/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-3300/trainer_state.json b/checkpoint-3300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1b2d7d31f08c2829ce93a2ac937d93c96a44e155 --- /dev/null +++ b/checkpoint-3300/trainer_state.json @@ -0,0 +1,4662 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6004366812227074, + "eval_steps": 100, + "global_step": 3300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8176324849839572e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3300/training_args.bin b/checkpoint-3300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-3300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-3400/README.md b/checkpoint-3400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-3400/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3400/adapter_config.json b/checkpoint-3400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-3400/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3400/adapter_model.safetensors b/checkpoint-3400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..81aca3510ccca4664b7b77c69a34f1bed6ac6a89 --- /dev/null +++ b/checkpoint-3400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f84d5e1dbf9adb3564ef0dd5855eda4e578dfb64526eab20ef967dcafa91de8c +size 169741912 diff --git a/checkpoint-3400/chat_template.jinja b/checkpoint-3400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-3400/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-3400/optimizer.pt b/checkpoint-3400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e2921c4e4fc064a5fbd73bd211b4b60fd103991 --- /dev/null +++ b/checkpoint-3400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2f3734b61c4f008d3375cf2af6c68eb9386d2a6832d9482b0ca22ab76e59a2 +size 72807355 diff --git a/checkpoint-3400/processor_config.json b/checkpoint-3400/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-3400/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-3400/rng_state.pth b/checkpoint-3400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-3400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-3400/scheduler.pt b/checkpoint-3400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4398756d480b21edbcb76e68de5eb5295c55bcd2 --- /dev/null +++ b/checkpoint-3400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce5a00f8ff3c6602d68364babff15c75ed86345b24f966653e3d9d6e6f07629d +size 1465 diff --git a/checkpoint-3400/tokenizer.json b/checkpoint-3400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-3400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-3400/tokenizer_config.json b/checkpoint-3400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-3400/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-3400/trainer_state.json b/checkpoint-3400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9113ed7d6c19bdf00eaededee5566c5d1d740ed1 --- /dev/null +++ b/checkpoint-3400/trainer_state.json @@ -0,0 +1,4802 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.61863173216885, + "eval_steps": 100, + "global_step": 3400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8722872678405663e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3400/training_args.bin b/checkpoint-3400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-3400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-3500/README.md b/checkpoint-3500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-3500/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3500/adapter_config.json b/checkpoint-3500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-3500/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3500/adapter_model.safetensors b/checkpoint-3500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4872acf012f516316068a397a3e22f1a9523bc3c --- /dev/null +++ b/checkpoint-3500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:384574f4d1031e95902d07379d0ca65cef78f2a497ce8f88e0b5fb2abd9befc7 +size 169741912 diff --git a/checkpoint-3500/chat_template.jinja b/checkpoint-3500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-3500/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-3500/optimizer.pt b/checkpoint-3500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce30efb9345abceba0680da344fdfd0ad13520c1 --- /dev/null +++ b/checkpoint-3500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cceea90703d2831388e6837b34fe0ea3e8c37f98beaec341ba630ec47a73b69 +size 72807355 diff --git a/checkpoint-3500/processor_config.json b/checkpoint-3500/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-3500/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-3500/rng_state.pth b/checkpoint-3500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-3500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-3500/scheduler.pt b/checkpoint-3500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4f2c71216c33756027cf1bc9233b471b10082886 --- /dev/null +++ b/checkpoint-3500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e7f5c829273d81c7d394275a2bb332278dfb03715ee3b6e4102dba4691e9d37 +size 1465 diff --git a/checkpoint-3500/tokenizer.json b/checkpoint-3500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-3500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-3500/tokenizer_config.json b/checkpoint-3500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-3500/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-3500/trainer_state.json b/checkpoint-3500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1aef63ff4eaed2165e38c9eb070351d0a2896ad1 --- /dev/null +++ b/checkpoint-3500/trainer_state.json @@ -0,0 +1,4942 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6368267831149927, + "eval_steps": 100, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9271465327291443e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3500/training_args.bin b/checkpoint-3500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-3500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-3600/README.md b/checkpoint-3600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-3600/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3600/adapter_config.json b/checkpoint-3600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-3600/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3600/adapter_model.safetensors b/checkpoint-3600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0672b63f57edd1965d6e90f60090756fe5f3a5ea --- /dev/null +++ b/checkpoint-3600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82f4db15171359b08051df7ef22b2a91759466d62f59e96fcc832280a756330 +size 169741912 diff --git a/checkpoint-3600/chat_template.jinja b/checkpoint-3600/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-3600/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-3600/optimizer.pt b/checkpoint-3600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5da9402cf51e94b26a2ea76746e2f43e3c2a7173 --- /dev/null +++ b/checkpoint-3600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81f9806af70ac150ea697e8bf39f88750fb40b1e6ca60c895897b99d1e88c451 +size 72807355 diff --git a/checkpoint-3600/processor_config.json b/checkpoint-3600/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-3600/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-3600/rng_state.pth b/checkpoint-3600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-3600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-3600/scheduler.pt b/checkpoint-3600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f95e80419dd9da35cffe755c298d5e917e06737 --- /dev/null +++ b/checkpoint-3600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8381d34f5354aa03b079a9399bfed585738434f75d919dd720618a74b99a1247 +size 1465 diff --git a/checkpoint-3600/tokenizer.json b/checkpoint-3600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-3600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-3600/tokenizer_config.json b/checkpoint-3600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-3600/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-3600/trainer_state.json b/checkpoint-3600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..52d156c0fb6c0eb1fd3c734e8f74e68af254f1f8 --- /dev/null +++ b/checkpoint-3600/trainer_state.json @@ -0,0 +1,5082 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6550218340611353, + "eval_steps": 100, + "global_step": 3600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.981864745409872e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3600/training_args.bin b/checkpoint-3600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-3600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-3700/README.md b/checkpoint-3700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-3700/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3700/adapter_config.json b/checkpoint-3700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-3700/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3700/adapter_model.safetensors b/checkpoint-3700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f30c6293c051d936a00e51a0a8df81a652d772f6 --- /dev/null +++ b/checkpoint-3700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da845288daa68c0affac00efd07f114f9dcc78e018847d986c15c1f79ec4d423 +size 169741912 diff --git a/checkpoint-3700/chat_template.jinja b/checkpoint-3700/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-3700/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-3700/optimizer.pt b/checkpoint-3700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea675ae4e682ce925f7efbb4471f23b91a233a2b --- /dev/null +++ b/checkpoint-3700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49556e185caef228272d3873c161ac51fb9fe4e142db4b7ca66d31e5543e911 +size 72807355 diff --git a/checkpoint-3700/processor_config.json b/checkpoint-3700/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-3700/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-3700/rng_state.pth b/checkpoint-3700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-3700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-3700/scheduler.pt b/checkpoint-3700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2caef3ec140126df6ed83d8003b196d0eacd08e --- /dev/null +++ b/checkpoint-3700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e165369a022f727a5dc2e033515cbe349803d48dbbc9acf35607a611f0cf526 +size 1465 diff --git a/checkpoint-3700/tokenizer.json b/checkpoint-3700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-3700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-3700/tokenizer_config.json b/checkpoint-3700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-3700/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-3700/trainer_state.json b/checkpoint-3700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..79d1a75319eca95c2e51a702ceff468cde6d277a --- /dev/null +++ b/checkpoint-3700/trainer_state.json @@ -0,0 +1,5222 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.673216885007278, + "eval_steps": 100, + "global_step": 3700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0364761863728922e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3700/training_args.bin b/checkpoint-3700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-3700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-3800/README.md b/checkpoint-3800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-3800/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3800/adapter_config.json b/checkpoint-3800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-3800/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3800/adapter_model.safetensors b/checkpoint-3800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fd326176a9cbc79c72c49b2842b8cbc4e18bf908 --- /dev/null +++ b/checkpoint-3800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a3f03ada8e1ed15490b9cb47aeb536f3bb198392f1bd7a70f78da113ba68092 +size 169741912 diff --git a/checkpoint-3800/chat_template.jinja b/checkpoint-3800/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-3800/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-3800/optimizer.pt b/checkpoint-3800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..745b657cab9fd348ccd3ebb0bd5903c4ef06ee9d --- /dev/null +++ b/checkpoint-3800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4748594c024b04d6c6b59db5553f34bddad24d2fd2dc7fd1a33325f745c5fc20 +size 72807355 diff --git a/checkpoint-3800/processor_config.json b/checkpoint-3800/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-3800/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-3800/rng_state.pth b/checkpoint-3800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-3800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-3800/scheduler.pt b/checkpoint-3800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dab54530d938cf24c40cb74d9e9d029117f1ef7a --- /dev/null +++ b/checkpoint-3800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d0b9d2db22198f8653b00ec482cfbb9290754de8025d23224966c69c5a07bc9 +size 1465 diff --git a/checkpoint-3800/tokenizer.json b/checkpoint-3800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-3800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-3800/tokenizer_config.json b/checkpoint-3800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-3800/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-3800/trainer_state.json b/checkpoint-3800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5276e96084aef72b5e015d7228036587bd08a926 --- /dev/null +++ b/checkpoint-3800/trainer_state.json @@ -0,0 +1,5362 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6914119359534207, + "eval_steps": 100, + "global_step": 3800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0912782443288745e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3800/training_args.bin b/checkpoint-3800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-3800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-3900/README.md b/checkpoint-3900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-3900/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-3900/adapter_config.json b/checkpoint-3900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-3900/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-3900/adapter_model.safetensors b/checkpoint-3900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b1c4604f7b97d8f15ce3b6520f68580a8e21a217 --- /dev/null +++ b/checkpoint-3900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:930c154cc15967bdadde6e3a4e438cf417ae2f9ff8f99e23c659a4c10500f638 +size 169741912 diff --git a/checkpoint-3900/chat_template.jinja b/checkpoint-3900/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-3900/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-3900/optimizer.pt b/checkpoint-3900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6a5abecb5d413a2d1b28bb175cd6ed9cf91fabe --- /dev/null +++ b/checkpoint-3900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3832963bedc2ddbde6ce39d4c5512721b4e48d086d12cdec63cef88f57e37f5f +size 72807355 diff --git a/checkpoint-3900/processor_config.json b/checkpoint-3900/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-3900/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-3900/rng_state.pth b/checkpoint-3900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-3900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-3900/scheduler.pt b/checkpoint-3900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f4abc4618901a501ee3a382986ffeee15db63a0 --- /dev/null +++ b/checkpoint-3900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bc4a059b28299a8bc4683917a6a5cb6a7fd2550c2a08109f253cfa3584ebaae +size 1465 diff --git a/checkpoint-3900/tokenizer.json b/checkpoint-3900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-3900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-3900/tokenizer_config.json b/checkpoint-3900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-3900/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-3900/trainer_state.json b/checkpoint-3900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..30a48f9418583fbf44dbde1efce9bcebf59a78ba --- /dev/null +++ b/checkpoint-3900/trainer_state.json @@ -0,0 +1,5502 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7096069868995634, + "eval_steps": 100, + "global_step": 3900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.146429330077812e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-3900/training_args.bin b/checkpoint-3900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-3900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-400/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-400/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-400/adapter_model.safetensors b/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..83637b00f165a90605bcc0237b5733b736b01632 --- /dev/null +++ b/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5376434516ca8d29b90a6d57eb8dabcc28d57a2b4ee2686d6b396663c726ac03 +size 169741912 diff --git a/checkpoint-400/chat_template.jinja b/checkpoint-400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-400/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..baa99a9bad68f126e96ca47c559cb0f82638851b --- /dev/null +++ b/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05ae1fdde3f57b9ae0ee54b2f1db326f37e068e2a2c912cfe0b484bbd379453a +size 72807355 diff --git a/checkpoint-400/processor_config.json b/checkpoint-400/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-400/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1ad556e5b8b695a8493682eb5176a8a51ef3995 --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b91c3f06e07c33026be1365e870f031bb614ff69eebc663c24c46d29531e21d6 +size 1465 diff --git a/checkpoint-400/tokenizer.json b/checkpoint-400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-400/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a3fb93958cc243843e2afbc35bb31eff8a945ee7 --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,602 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.07278020378457059, + "eval_steps": 100, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2959893223753523e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-4000/README.md b/checkpoint-4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-4000/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4000/adapter_config.json b/checkpoint-4000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-4000/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4000/adapter_model.safetensors b/checkpoint-4000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..122225dd523bb4ebeaf0082ea3bcb92a6e1dac7a --- /dev/null +++ b/checkpoint-4000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f6d4a4c0f7f29a79c24ba03c70be0b9f9ad2322eaec2507a4bdd253e1877f3e +size 169741912 diff --git a/checkpoint-4000/chat_template.jinja b/checkpoint-4000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-4000/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-4000/optimizer.pt b/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..02df676d5405acbb1640036be3872d2e20f4f83a --- /dev/null +++ b/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:261af0e2bee0b8fe5b21d7b03a5d9a95b5648f50dc926d3ad176128963a89839 +size 72807355 diff --git a/checkpoint-4000/processor_config.json b/checkpoint-4000/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-4000/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-4000/rng_state.pth b/checkpoint-4000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-4000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-4000/scheduler.pt b/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e45086845a384e4c8a98a875dc676b7e4dc576d --- /dev/null +++ b/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8e1aa2c2ec8acfeb7d7d0e346bde622c16805e36befbe0290006307b2751b20 +size 1465 diff --git a/checkpoint-4000/tokenizer.json b/checkpoint-4000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-4000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-4000/tokenizer_config.json b/checkpoint-4000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-4000/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-4000/trainer_state.json b/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7e82d6b94c578cbceba70578b6bfe9649017fcfa --- /dev/null +++ b/checkpoint-4000/trainer_state.json @@ -0,0 +1,5642 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.727802037845706, + "eval_steps": 100, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2009046316420787e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4000/training_args.bin b/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-4100/README.md b/checkpoint-4100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-4100/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4100/adapter_config.json b/checkpoint-4100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-4100/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4100/adapter_model.safetensors b/checkpoint-4100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4a18403524e41012c589c271ad4dc6f05d9c738d --- /dev/null +++ b/checkpoint-4100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58539d3520353ef7548f3cb4afce746f3ecf3ab64a94b9beee75d776193ff575 +size 169741912 diff --git a/checkpoint-4100/chat_template.jinja b/checkpoint-4100/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-4100/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-4100/optimizer.pt b/checkpoint-4100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3be23f7f70eba5099e26bbc4ac1ad6cb0317a3cb --- /dev/null +++ b/checkpoint-4100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daf64ecc2aea1438e939cefa2112beb45d33571013c1057b470fefbf0ceb1a15 +size 72807355 diff --git a/checkpoint-4100/processor_config.json b/checkpoint-4100/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-4100/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-4100/rng_state.pth b/checkpoint-4100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-4100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-4100/scheduler.pt b/checkpoint-4100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0c29f060d9dd4277eeb4ecf92eb795e8c08d003 --- /dev/null +++ b/checkpoint-4100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70d73dcbe82614473137cbc421461dfb966bac186f60581edd01b1ecddd8cb5b +size 1465 diff --git a/checkpoint-4100/tokenizer.json b/checkpoint-4100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-4100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-4100/tokenizer_config.json b/checkpoint-4100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-4100/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-4100/trainer_state.json b/checkpoint-4100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..90a7cc590216780c435cdf03afc86a636b100c7f --- /dev/null +++ b/checkpoint-4100/trainer_state.json @@ -0,0 +1,5782 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7459970887918487, + "eval_steps": 100, + "global_step": 4100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2550361457430938e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4100/training_args.bin b/checkpoint-4100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-4100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-4200/README.md b/checkpoint-4200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-4200/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4200/adapter_config.json b/checkpoint-4200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-4200/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4200/adapter_model.safetensors b/checkpoint-4200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2570f36ac78878c6e97e690641f61c09a36908ba --- /dev/null +++ b/checkpoint-4200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8fdf9cab00c6fe8710b887249098c77545137bdf0cfbe89a2efce95eba18135 +size 169741912 diff --git a/checkpoint-4200/chat_template.jinja b/checkpoint-4200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-4200/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-4200/optimizer.pt b/checkpoint-4200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b547347b2b4646ecaa0f7228d245b95e5f955a9f --- /dev/null +++ b/checkpoint-4200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399886bf84ebe0ebecbc0b6e99ea015dc2c82c288b3ac7ae70f4140221e0b46c +size 72807355 diff --git a/checkpoint-4200/processor_config.json b/checkpoint-4200/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-4200/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-4200/rng_state.pth b/checkpoint-4200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-4200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-4200/scheduler.pt b/checkpoint-4200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4d152871cc7296c135da6ad42c3ad6c0f28a4e7 --- /dev/null +++ b/checkpoint-4200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec927d5dd629391e1d3dbd2e680b7b7d117933dd3a6c4235cefa58c2e99af6d3 +size 1465 diff --git a/checkpoint-4200/tokenizer.json b/checkpoint-4200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-4200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-4200/tokenizer_config.json b/checkpoint-4200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-4200/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-4200/trainer_state.json b/checkpoint-4200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..973c0fc4e3750f721099ebadee25c19a26dce1c1 --- /dev/null +++ b/checkpoint-4200/trainer_state.json @@ -0,0 +1,5922 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7641921397379913, + "eval_steps": 100, + "global_step": 4200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.310282758335583e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4200/training_args.bin b/checkpoint-4200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-4200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-4300/README.md b/checkpoint-4300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-4300/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4300/adapter_config.json b/checkpoint-4300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-4300/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4300/adapter_model.safetensors b/checkpoint-4300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..21a3bfdf15ae89346be952eecbf49eaa9895b507 --- /dev/null +++ b/checkpoint-4300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eced2e5f1c6843a8db0df0ec5eb04b45b7692d56698e73044e8992c34df47313 +size 169741912 diff --git a/checkpoint-4300/chat_template.jinja b/checkpoint-4300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-4300/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-4300/optimizer.pt b/checkpoint-4300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4af328e6e08f11ea5ca478fb648c4889b691e981 --- /dev/null +++ b/checkpoint-4300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0008b10c3a319ad2cf110abf3c4ba5abf9cb83f10750b788ea1cdb1dabbe43e3 +size 72807355 diff --git a/checkpoint-4300/processor_config.json b/checkpoint-4300/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-4300/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-4300/rng_state.pth b/checkpoint-4300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-4300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-4300/scheduler.pt b/checkpoint-4300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d209d60bdad40196a5d6cf52ef9fb8998ed35455 --- /dev/null +++ b/checkpoint-4300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5db332af5b43857b3570818d2a706d2392164182f86ae875d5e386c1f9ce788a +size 1465 diff --git a/checkpoint-4300/tokenizer.json b/checkpoint-4300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-4300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-4300/tokenizer_config.json b/checkpoint-4300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-4300/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-4300/trainer_state.json b/checkpoint-4300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3c808868ab6b360268da6e3c09cf1ea9af405893 --- /dev/null +++ b/checkpoint-4300/trainer_state.json @@ -0,0 +1,6062 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7823871906841339, + "eval_steps": 100, + "global_step": 4300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.366369788804259e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4300/training_args.bin b/checkpoint-4300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-4300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-4400/README.md b/checkpoint-4400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-4400/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4400/adapter_config.json b/checkpoint-4400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-4400/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4400/adapter_model.safetensors b/checkpoint-4400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..99eee17bdeaf130d77edeec2af4852d28a487d3e --- /dev/null +++ b/checkpoint-4400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:382f8773e3bb8d8b50a829e51e8adeb1f7677513e05f5adb129b1378b0790129 +size 169741912 diff --git a/checkpoint-4400/chat_template.jinja b/checkpoint-4400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-4400/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-4400/optimizer.pt b/checkpoint-4400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1afc56a4a896f70292ba012ab9e77158a78b525 --- /dev/null +++ b/checkpoint-4400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bea3fc6fed190bfcda9845af5cf6f5fe2704ce3e35d310eb436cf67ab8640ddc +size 72807355 diff --git a/checkpoint-4400/processor_config.json b/checkpoint-4400/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-4400/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-4400/rng_state.pth b/checkpoint-4400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-4400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-4400/scheduler.pt b/checkpoint-4400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..db5f48848af387f93415b758bc879ec59e210237 --- /dev/null +++ b/checkpoint-4400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e602cd7d0b8af82ee04a51f3822660b9aefb6c22de1107ff520087edc92e07c8 +size 1465 diff --git a/checkpoint-4400/tokenizer.json b/checkpoint-4400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-4400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-4400/tokenizer_config.json b/checkpoint-4400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-4400/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-4400/trainer_state.json b/checkpoint-4400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a0923cb7694dab7be7aae7c10e1adfd92f8e2f46 --- /dev/null +++ b/checkpoint-4400/trainer_state.json @@ -0,0 +1,6202 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8005822416302766, + "eval_steps": 100, + "global_step": 4400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.419746258733838e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4400/training_args.bin b/checkpoint-4400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-4400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-4500/README.md b/checkpoint-4500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-4500/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4500/adapter_config.json b/checkpoint-4500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-4500/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4500/adapter_model.safetensors b/checkpoint-4500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e8ebcdcd711a3becbd00e8e12a4df02740e45b85 --- /dev/null +++ b/checkpoint-4500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a9669a0c97afe6dc5a4a3a2ca2d576054b2401ba19e0ab40447a9c9dacf454 +size 169741912 diff --git a/checkpoint-4500/chat_template.jinja b/checkpoint-4500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-4500/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-4500/optimizer.pt b/checkpoint-4500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fcc6547deb9ea802444bfa07c707a2dbf6f56b7 --- /dev/null +++ b/checkpoint-4500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:725a5212f8154f26c18f3bf6f21c99c3610460be8e36cd5b4258408fd9ce07b0 +size 72807355 diff --git a/checkpoint-4500/processor_config.json b/checkpoint-4500/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-4500/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-4500/rng_state.pth b/checkpoint-4500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-4500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-4500/scheduler.pt b/checkpoint-4500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..92670746a2524edac6085d32fcfd8c64889ac055 --- /dev/null +++ b/checkpoint-4500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28135fc83e9e89f0b4f8eddf7929fecdeff05a9e15ba8b9f299bb0b694eae52b +size 1465 diff --git a/checkpoint-4500/tokenizer.json b/checkpoint-4500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-4500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-4500/tokenizer_config.json b/checkpoint-4500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-4500/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-4500/trainer_state.json b/checkpoint-4500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..113f5c01f3ef91e41a4e2ca304aa2b0cf6a2357d --- /dev/null +++ b/checkpoint-4500/trainer_state.json @@ -0,0 +1,6342 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8187772925764192, + "eval_steps": 100, + "global_step": 4500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.473992405047116e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4500/training_args.bin b/checkpoint-4500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-4500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-4600/README.md b/checkpoint-4600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-4600/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4600/adapter_config.json b/checkpoint-4600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-4600/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4600/adapter_model.safetensors b/checkpoint-4600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6d8ccfdd04a942ec9e97ccdac39d60a1fa198415 --- /dev/null +++ b/checkpoint-4600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f99b43fa7dd20c233bf68eca4c2431df06f09ab757a998e33fdecd1901d66069 +size 169741912 diff --git a/checkpoint-4600/chat_template.jinja b/checkpoint-4600/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-4600/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-4600/optimizer.pt b/checkpoint-4600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b1cc838a7dc321de17bd5f345192fb3c38e94809 --- /dev/null +++ b/checkpoint-4600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccd702ea51b20495eba5844a3074ff4eb31559fb94b3b6d71f24ba8dff7299cd +size 72807355 diff --git a/checkpoint-4600/processor_config.json b/checkpoint-4600/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-4600/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-4600/rng_state.pth b/checkpoint-4600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-4600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-4600/scheduler.pt b/checkpoint-4600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa3098c81e713df175dbe75e6e8112afb8211b31 --- /dev/null +++ b/checkpoint-4600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d632bb86d093612d43b72d8b08b04690461ffe2f2b2bc3dedcba090bfc88d928 +size 1465 diff --git a/checkpoint-4600/tokenizer.json b/checkpoint-4600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-4600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-4600/tokenizer_config.json b/checkpoint-4600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-4600/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-4600/trainer_state.json b/checkpoint-4600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..65a3b117e759497d1514643514ef43df0ddf9ac0 --- /dev/null +++ b/checkpoint-4600/trainer_state.json @@ -0,0 +1,6482 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8369723435225619, + "eval_steps": 100, + "global_step": 4600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + }, + { + "epoch": 0.8196870451237264, + "grad_norm": 0.1393742561340332, + "learning_rate": 4.178055288745053e-06, + "loss": 0.13769235610961914, + "step": 4505 + }, + { + "epoch": 0.8205967976710334, + "grad_norm": 0.1668994128704071, + "learning_rate": 4.137406700206617e-06, + "loss": 0.14029752016067504, + "step": 4510 + }, + { + "epoch": 0.8215065502183406, + "grad_norm": 0.1833454668521881, + "learning_rate": 4.0969389701515675e-06, + "loss": 0.14276301860809326, + "step": 4515 + }, + { + "epoch": 0.8224163027656477, + "grad_norm": 0.16187874972820282, + "learning_rate": 4.056652449395945e-06, + "loss": 0.1444832682609558, + "step": 4520 + }, + { + "epoch": 0.8233260553129549, + "grad_norm": 0.1453280746936798, + "learning_rate": 4.01654748718488e-06, + "loss": 0.14512733221054078, + "step": 4525 + }, + { + "epoch": 0.824235807860262, + "grad_norm": 0.1782725751399994, + "learning_rate": 3.976624431189563e-06, + "loss": 0.14093561172485353, + "step": 4530 + }, + { + "epoch": 0.8251455604075691, + "grad_norm": 0.17374491691589355, + "learning_rate": 3.936883627504234e-06, + "loss": 0.14031401872634888, + "step": 4535 + }, + { + "epoch": 0.8260553129548762, + "grad_norm": 0.1609172821044922, + "learning_rate": 3.897325420643174e-06, + "loss": 0.1428336262702942, + "step": 4540 + }, + { + "epoch": 0.8269650655021834, + "grad_norm": 0.1520884931087494, + "learning_rate": 3.85795015353774e-06, + "loss": 0.1460547924041748, + "step": 4545 + }, + { + "epoch": 0.8278748180494906, + "grad_norm": 0.20986326038837433, + "learning_rate": 3.818758167533376e-06, + "loss": 0.14706350564956666, + "step": 4550 + }, + { + "epoch": 0.8287845705967977, + "grad_norm": 0.16825413703918457, + "learning_rate": 3.7797498023866396e-06, + "loss": 0.14507200717926025, + "step": 4555 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.16758380830287933, + "learning_rate": 3.740925396262296e-06, + "loss": 0.14898381233215333, + "step": 4560 + }, + { + "epoch": 0.8306040756914119, + "grad_norm": 0.15207453072071075, + "learning_rate": 3.7022852857303503e-06, + "loss": 0.14138854742050172, + "step": 4565 + }, + { + "epoch": 0.8315138282387191, + "grad_norm": 0.15150749683380127, + "learning_rate": 3.66382980576315e-06, + "loss": 0.13894975185394287, + "step": 4570 + }, + { + "epoch": 0.8324235807860262, + "grad_norm": 0.17071188986301422, + "learning_rate": 3.625559289732472e-06, + "loss": 0.14072470664978026, + "step": 4575 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.154335618019104, + "learning_rate": 3.5874740694066294e-06, + "loss": 0.13791344165802003, + "step": 4580 + }, + { + "epoch": 0.8342430858806404, + "grad_norm": 0.14017128944396973, + "learning_rate": 3.5495744749476116e-06, + "loss": 0.14427922964096068, + "step": 4585 + }, + { + "epoch": 0.8351528384279476, + "grad_norm": 0.17210033535957336, + "learning_rate": 3.5118608349081983e-06, + "loss": 0.15191166400909423, + "step": 4590 + }, + { + "epoch": 0.8360625909752547, + "grad_norm": 0.18715685606002808, + "learning_rate": 3.4743334762291358e-06, + "loss": 0.14451316595077515, + "step": 4595 + }, + { + "epoch": 0.8369723435225619, + "grad_norm": 0.18079884350299835, + "learning_rate": 3.436992724236293e-06, + "loss": 0.13530746698379517, + "step": 4600 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.529148621952221e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4600/training_args.bin b/checkpoint-4600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-4600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-4700/README.md b/checkpoint-4700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-4700/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4700/adapter_config.json b/checkpoint-4700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-4700/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4700/adapter_model.safetensors b/checkpoint-4700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..705fdef1f7d1f50791858a03e6e3e3cc1cb33772 --- /dev/null +++ b/checkpoint-4700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40795c528c213a9d0c33604ca35fda29783ae39cf438979be0a0a9399f3f749f +size 169741912 diff --git a/checkpoint-4700/chat_template.jinja b/checkpoint-4700/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-4700/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-4700/optimizer.pt b/checkpoint-4700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6382636f2f7f8d21e409c7c01b88f8dec42f2ea --- /dev/null +++ b/checkpoint-4700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49875dd003bad3ded6c6745226845868f086532e0126eeb03e8fdfe2d00d524a +size 72807355 diff --git a/checkpoint-4700/processor_config.json b/checkpoint-4700/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-4700/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-4700/rng_state.pth b/checkpoint-4700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-4700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-4700/scheduler.pt b/checkpoint-4700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..00cf0b221cced2313e6a1489c95f2debe3f3ffcc --- /dev/null +++ b/checkpoint-4700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8a0ab8f7a4ad7058c95af284f748e7f4487cc3c575a38242c2321938a3cd3e3 +size 1465 diff --git a/checkpoint-4700/tokenizer.json b/checkpoint-4700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-4700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-4700/tokenizer_config.json b/checkpoint-4700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-4700/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-4700/trainer_state.json b/checkpoint-4700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6340bf480675fb0f5b7c50d1740bdef37eedc890 --- /dev/null +++ b/checkpoint-4700/trainer_state.json @@ -0,0 +1,6622 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8551673944687045, + "eval_steps": 100, + "global_step": 4700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + }, + { + "epoch": 0.8196870451237264, + "grad_norm": 0.1393742561340332, + "learning_rate": 4.178055288745053e-06, + "loss": 0.13769235610961914, + "step": 4505 + }, + { + "epoch": 0.8205967976710334, + "grad_norm": 0.1668994128704071, + "learning_rate": 4.137406700206617e-06, + "loss": 0.14029752016067504, + "step": 4510 + }, + { + "epoch": 0.8215065502183406, + "grad_norm": 0.1833454668521881, + "learning_rate": 4.0969389701515675e-06, + "loss": 0.14276301860809326, + "step": 4515 + }, + { + "epoch": 0.8224163027656477, + "grad_norm": 0.16187874972820282, + "learning_rate": 4.056652449395945e-06, + "loss": 0.1444832682609558, + "step": 4520 + }, + { + "epoch": 0.8233260553129549, + "grad_norm": 0.1453280746936798, + "learning_rate": 4.01654748718488e-06, + "loss": 0.14512733221054078, + "step": 4525 + }, + { + "epoch": 0.824235807860262, + "grad_norm": 0.1782725751399994, + "learning_rate": 3.976624431189563e-06, + "loss": 0.14093561172485353, + "step": 4530 + }, + { + "epoch": 0.8251455604075691, + "grad_norm": 0.17374491691589355, + "learning_rate": 3.936883627504234e-06, + "loss": 0.14031401872634888, + "step": 4535 + }, + { + "epoch": 0.8260553129548762, + "grad_norm": 0.1609172821044922, + "learning_rate": 3.897325420643174e-06, + "loss": 0.1428336262702942, + "step": 4540 + }, + { + "epoch": 0.8269650655021834, + "grad_norm": 0.1520884931087494, + "learning_rate": 3.85795015353774e-06, + "loss": 0.1460547924041748, + "step": 4545 + }, + { + "epoch": 0.8278748180494906, + "grad_norm": 0.20986326038837433, + "learning_rate": 3.818758167533376e-06, + "loss": 0.14706350564956666, + "step": 4550 + }, + { + "epoch": 0.8287845705967977, + "grad_norm": 0.16825413703918457, + "learning_rate": 3.7797498023866396e-06, + "loss": 0.14507200717926025, + "step": 4555 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.16758380830287933, + "learning_rate": 3.740925396262296e-06, + "loss": 0.14898381233215333, + "step": 4560 + }, + { + "epoch": 0.8306040756914119, + "grad_norm": 0.15207453072071075, + "learning_rate": 3.7022852857303503e-06, + "loss": 0.14138854742050172, + "step": 4565 + }, + { + "epoch": 0.8315138282387191, + "grad_norm": 0.15150749683380127, + "learning_rate": 3.66382980576315e-06, + "loss": 0.13894975185394287, + "step": 4570 + }, + { + "epoch": 0.8324235807860262, + "grad_norm": 0.17071188986301422, + "learning_rate": 3.625559289732472e-06, + "loss": 0.14072470664978026, + "step": 4575 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.154335618019104, + "learning_rate": 3.5874740694066294e-06, + "loss": 0.13791344165802003, + "step": 4580 + }, + { + "epoch": 0.8342430858806404, + "grad_norm": 0.14017128944396973, + "learning_rate": 3.5495744749476116e-06, + "loss": 0.14427922964096068, + "step": 4585 + }, + { + "epoch": 0.8351528384279476, + "grad_norm": 0.17210033535957336, + "learning_rate": 3.5118608349081983e-06, + "loss": 0.15191166400909423, + "step": 4590 + }, + { + "epoch": 0.8360625909752547, + "grad_norm": 0.18715685606002808, + "learning_rate": 3.4743334762291358e-06, + "loss": 0.14451316595077515, + "step": 4595 + }, + { + "epoch": 0.8369723435225619, + "grad_norm": 0.18079884350299835, + "learning_rate": 3.436992724236293e-06, + "loss": 0.13530746698379517, + "step": 4600 + }, + { + "epoch": 0.837882096069869, + "grad_norm": 0.13519920408725739, + "learning_rate": 3.399838902637817e-06, + "loss": 0.1477964401245117, + "step": 4605 + }, + { + "epoch": 0.8387918486171762, + "grad_norm": 0.1778026670217514, + "learning_rate": 3.3628723335213885e-06, + "loss": 0.14419831037521363, + "step": 4610 + }, + { + "epoch": 0.8397016011644832, + "grad_norm": 0.15165366232395172, + "learning_rate": 3.326093337351355e-06, + "loss": 0.13888469934463502, + "step": 4615 + }, + { + "epoch": 0.8406113537117904, + "grad_norm": 0.17049473524093628, + "learning_rate": 3.2895022329660018e-06, + "loss": 0.14438477754592896, + "step": 4620 + }, + { + "epoch": 0.8415211062590975, + "grad_norm": 0.16536414623260498, + "learning_rate": 3.2530993375747833e-06, + "loss": 0.1444351315498352, + "step": 4625 + }, + { + "epoch": 0.8424308588064047, + "grad_norm": 0.17570015788078308, + "learning_rate": 3.2168849667555402e-06, + "loss": 0.13861945867538453, + "step": 4630 + }, + { + "epoch": 0.8433406113537117, + "grad_norm": 0.1699545532464981, + "learning_rate": 3.1808594344518132e-06, + "loss": 0.13902754783630372, + "step": 4635 + }, + { + "epoch": 0.8442503639010189, + "grad_norm": 0.12331254780292511, + "learning_rate": 3.1450230529700837e-06, + "loss": 0.14104254245758058, + "step": 4640 + }, + { + "epoch": 0.845160116448326, + "grad_norm": 0.1508190929889679, + "learning_rate": 3.1093761329770708e-06, + "loss": 0.13288766145706177, + "step": 4645 + }, + { + "epoch": 0.8460698689956332, + "grad_norm": 0.19049489498138428, + "learning_rate": 3.0739189834970735e-06, + "loss": 0.14914840459823608, + "step": 4650 + }, + { + "epoch": 0.8469796215429404, + "grad_norm": 0.1662369966506958, + "learning_rate": 3.0386519119092293e-06, + "loss": 0.14222898483276367, + "step": 4655 + }, + { + "epoch": 0.8478893740902474, + "grad_norm": 0.18985967338085175, + "learning_rate": 3.0035752239449126e-06, + "loss": 0.14431113004684448, + "step": 4660 + }, + { + "epoch": 0.8487991266375546, + "grad_norm": 0.17005261778831482, + "learning_rate": 2.9686892236850337e-06, + "loss": 0.14140807390213012, + "step": 4665 + }, + { + "epoch": 0.8497088791848617, + "grad_norm": 0.16786684095859528, + "learning_rate": 2.9339942135574394e-06, + "loss": 0.14161460399627684, + "step": 4670 + }, + { + "epoch": 0.8506186317321689, + "grad_norm": 0.16358181834220886, + "learning_rate": 2.899490494334281e-06, + "loss": 0.14674670696258546, + "step": 4675 + }, + { + "epoch": 0.851528384279476, + "grad_norm": 0.1651349812746048, + "learning_rate": 2.8651783651293867e-06, + "loss": 0.13794611692428588, + "step": 4680 + }, + { + "epoch": 0.8524381368267832, + "grad_norm": 0.16934923827648163, + "learning_rate": 2.831058123395694e-06, + "loss": 0.13199397325515747, + "step": 4685 + }, + { + "epoch": 0.8533478893740902, + "grad_norm": 0.1704150140285492, + "learning_rate": 2.797130064922665e-06, + "loss": 0.14044904708862305, + "step": 4690 + }, + { + "epoch": 0.8542576419213974, + "grad_norm": 0.1814192682504654, + "learning_rate": 2.7633944838337143e-06, + "loss": 0.1465100646018982, + "step": 4695 + }, + { + "epoch": 0.8551673944687045, + "grad_norm": 0.18942610919475555, + "learning_rate": 2.729851672583669e-06, + "loss": 0.14685982465744019, + "step": 4700 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.584293375636099e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4700/training_args.bin b/checkpoint-4700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-4700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-4800/README.md b/checkpoint-4800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-4800/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4800/adapter_config.json b/checkpoint-4800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-4800/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4800/adapter_model.safetensors b/checkpoint-4800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3d0a1ba8062224b35070478c18c1ae754c7b13a4 --- /dev/null +++ b/checkpoint-4800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67b1d36e77867d8477a11fe1aaf8a2840d592a443782cc064829db4dfafb6a87 +size 169741912 diff --git a/checkpoint-4800/chat_template.jinja b/checkpoint-4800/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-4800/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-4800/optimizer.pt b/checkpoint-4800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce496aea57076dacb2e9da05d33f3c53ad7509d6 --- /dev/null +++ b/checkpoint-4800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a45cc82d4224e600cf448c802460c6ce98419f7fc4a1ea80c6bd6835f378e34 +size 72807355 diff --git a/checkpoint-4800/processor_config.json b/checkpoint-4800/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-4800/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-4800/rng_state.pth b/checkpoint-4800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-4800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-4800/scheduler.pt b/checkpoint-4800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ec01cd25f540fcd1967eba137981512bb4614a3 --- /dev/null +++ b/checkpoint-4800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aedf70d9ea7ff9ec0e7c368bea8350cbcd293e65545950562608f0d2a10a4e0c +size 1465 diff --git a/checkpoint-4800/tokenizer.json b/checkpoint-4800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-4800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-4800/tokenizer_config.json b/checkpoint-4800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-4800/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-4800/trainer_state.json b/checkpoint-4800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..68a3ec5e69df42f786cd9c4fb293a5d9cf426b0a --- /dev/null +++ b/checkpoint-4800/trainer_state.json @@ -0,0 +1,6762 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8733624454148472, + "eval_steps": 100, + "global_step": 4800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + }, + { + "epoch": 0.8196870451237264, + "grad_norm": 0.1393742561340332, + "learning_rate": 4.178055288745053e-06, + "loss": 0.13769235610961914, + "step": 4505 + }, + { + "epoch": 0.8205967976710334, + "grad_norm": 0.1668994128704071, + "learning_rate": 4.137406700206617e-06, + "loss": 0.14029752016067504, + "step": 4510 + }, + { + "epoch": 0.8215065502183406, + "grad_norm": 0.1833454668521881, + "learning_rate": 4.0969389701515675e-06, + "loss": 0.14276301860809326, + "step": 4515 + }, + { + "epoch": 0.8224163027656477, + "grad_norm": 0.16187874972820282, + "learning_rate": 4.056652449395945e-06, + "loss": 0.1444832682609558, + "step": 4520 + }, + { + "epoch": 0.8233260553129549, + "grad_norm": 0.1453280746936798, + "learning_rate": 4.01654748718488e-06, + "loss": 0.14512733221054078, + "step": 4525 + }, + { + "epoch": 0.824235807860262, + "grad_norm": 0.1782725751399994, + "learning_rate": 3.976624431189563e-06, + "loss": 0.14093561172485353, + "step": 4530 + }, + { + "epoch": 0.8251455604075691, + "grad_norm": 0.17374491691589355, + "learning_rate": 3.936883627504234e-06, + "loss": 0.14031401872634888, + "step": 4535 + }, + { + "epoch": 0.8260553129548762, + "grad_norm": 0.1609172821044922, + "learning_rate": 3.897325420643174e-06, + "loss": 0.1428336262702942, + "step": 4540 + }, + { + "epoch": 0.8269650655021834, + "grad_norm": 0.1520884931087494, + "learning_rate": 3.85795015353774e-06, + "loss": 0.1460547924041748, + "step": 4545 + }, + { + "epoch": 0.8278748180494906, + "grad_norm": 0.20986326038837433, + "learning_rate": 3.818758167533376e-06, + "loss": 0.14706350564956666, + "step": 4550 + }, + { + "epoch": 0.8287845705967977, + "grad_norm": 0.16825413703918457, + "learning_rate": 3.7797498023866396e-06, + "loss": 0.14507200717926025, + "step": 4555 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.16758380830287933, + "learning_rate": 3.740925396262296e-06, + "loss": 0.14898381233215333, + "step": 4560 + }, + { + "epoch": 0.8306040756914119, + "grad_norm": 0.15207453072071075, + "learning_rate": 3.7022852857303503e-06, + "loss": 0.14138854742050172, + "step": 4565 + }, + { + "epoch": 0.8315138282387191, + "grad_norm": 0.15150749683380127, + "learning_rate": 3.66382980576315e-06, + "loss": 0.13894975185394287, + "step": 4570 + }, + { + "epoch": 0.8324235807860262, + "grad_norm": 0.17071188986301422, + "learning_rate": 3.625559289732472e-06, + "loss": 0.14072470664978026, + "step": 4575 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.154335618019104, + "learning_rate": 3.5874740694066294e-06, + "loss": 0.13791344165802003, + "step": 4580 + }, + { + "epoch": 0.8342430858806404, + "grad_norm": 0.14017128944396973, + "learning_rate": 3.5495744749476116e-06, + "loss": 0.14427922964096068, + "step": 4585 + }, + { + "epoch": 0.8351528384279476, + "grad_norm": 0.17210033535957336, + "learning_rate": 3.5118608349081983e-06, + "loss": 0.15191166400909423, + "step": 4590 + }, + { + "epoch": 0.8360625909752547, + "grad_norm": 0.18715685606002808, + "learning_rate": 3.4743334762291358e-06, + "loss": 0.14451316595077515, + "step": 4595 + }, + { + "epoch": 0.8369723435225619, + "grad_norm": 0.18079884350299835, + "learning_rate": 3.436992724236293e-06, + "loss": 0.13530746698379517, + "step": 4600 + }, + { + "epoch": 0.837882096069869, + "grad_norm": 0.13519920408725739, + "learning_rate": 3.399838902637817e-06, + "loss": 0.1477964401245117, + "step": 4605 + }, + { + "epoch": 0.8387918486171762, + "grad_norm": 0.1778026670217514, + "learning_rate": 3.3628723335213885e-06, + "loss": 0.14419831037521363, + "step": 4610 + }, + { + "epoch": 0.8397016011644832, + "grad_norm": 0.15165366232395172, + "learning_rate": 3.326093337351355e-06, + "loss": 0.13888469934463502, + "step": 4615 + }, + { + "epoch": 0.8406113537117904, + "grad_norm": 0.17049473524093628, + "learning_rate": 3.2895022329660018e-06, + "loss": 0.14438477754592896, + "step": 4620 + }, + { + "epoch": 0.8415211062590975, + "grad_norm": 0.16536414623260498, + "learning_rate": 3.2530993375747833e-06, + "loss": 0.1444351315498352, + "step": 4625 + }, + { + "epoch": 0.8424308588064047, + "grad_norm": 0.17570015788078308, + "learning_rate": 3.2168849667555402e-06, + "loss": 0.13861945867538453, + "step": 4630 + }, + { + "epoch": 0.8433406113537117, + "grad_norm": 0.1699545532464981, + "learning_rate": 3.1808594344518132e-06, + "loss": 0.13902754783630372, + "step": 4635 + }, + { + "epoch": 0.8442503639010189, + "grad_norm": 0.12331254780292511, + "learning_rate": 3.1450230529700837e-06, + "loss": 0.14104254245758058, + "step": 4640 + }, + { + "epoch": 0.845160116448326, + "grad_norm": 0.1508190929889679, + "learning_rate": 3.1093761329770708e-06, + "loss": 0.13288766145706177, + "step": 4645 + }, + { + "epoch": 0.8460698689956332, + "grad_norm": 0.19049489498138428, + "learning_rate": 3.0739189834970735e-06, + "loss": 0.14914840459823608, + "step": 4650 + }, + { + "epoch": 0.8469796215429404, + "grad_norm": 0.1662369966506958, + "learning_rate": 3.0386519119092293e-06, + "loss": 0.14222898483276367, + "step": 4655 + }, + { + "epoch": 0.8478893740902474, + "grad_norm": 0.18985967338085175, + "learning_rate": 3.0035752239449126e-06, + "loss": 0.14431113004684448, + "step": 4660 + }, + { + "epoch": 0.8487991266375546, + "grad_norm": 0.17005261778831482, + "learning_rate": 2.9686892236850337e-06, + "loss": 0.14140807390213012, + "step": 4665 + }, + { + "epoch": 0.8497088791848617, + "grad_norm": 0.16786684095859528, + "learning_rate": 2.9339942135574394e-06, + "loss": 0.14161460399627684, + "step": 4670 + }, + { + "epoch": 0.8506186317321689, + "grad_norm": 0.16358181834220886, + "learning_rate": 2.899490494334281e-06, + "loss": 0.14674670696258546, + "step": 4675 + }, + { + "epoch": 0.851528384279476, + "grad_norm": 0.1651349812746048, + "learning_rate": 2.8651783651293867e-06, + "loss": 0.13794611692428588, + "step": 4680 + }, + { + "epoch": 0.8524381368267832, + "grad_norm": 0.16934923827648163, + "learning_rate": 2.831058123395694e-06, + "loss": 0.13199397325515747, + "step": 4685 + }, + { + "epoch": 0.8533478893740902, + "grad_norm": 0.1704150140285492, + "learning_rate": 2.797130064922665e-06, + "loss": 0.14044904708862305, + "step": 4690 + }, + { + "epoch": 0.8542576419213974, + "grad_norm": 0.1814192682504654, + "learning_rate": 2.7633944838337143e-06, + "loss": 0.1465100646018982, + "step": 4695 + }, + { + "epoch": 0.8551673944687045, + "grad_norm": 0.18942610919475555, + "learning_rate": 2.729851672583669e-06, + "loss": 0.14685982465744019, + "step": 4700 + }, + { + "epoch": 0.8560771470160117, + "grad_norm": 0.17895208299160004, + "learning_rate": 2.6965019219562155e-06, + "loss": 0.13971571922302245, + "step": 4705 + }, + { + "epoch": 0.8569868995633187, + "grad_norm": 0.22735828161239624, + "learning_rate": 2.6633455210614055e-06, + "loss": 0.13776102066040039, + "step": 4710 + }, + { + "epoch": 0.8578966521106259, + "grad_norm": 0.16779793798923492, + "learning_rate": 2.630382757333133e-06, + "loss": 0.14134042263031005, + "step": 4715 + }, + { + "epoch": 0.858806404657933, + "grad_norm": 0.2148888260126114, + "learning_rate": 2.597613916526637e-06, + "loss": 0.14680721759796142, + "step": 4720 + }, + { + "epoch": 0.8597161572052402, + "grad_norm": 0.16560257971286774, + "learning_rate": 2.565039282716045e-06, + "loss": 0.14137234687805175, + "step": 4725 + }, + { + "epoch": 0.8606259097525473, + "grad_norm": 0.16197068989276886, + "learning_rate": 2.532659138291879e-06, + "loss": 0.14969314336776735, + "step": 4730 + }, + { + "epoch": 0.8615356622998545, + "grad_norm": 0.14650246500968933, + "learning_rate": 2.5004737639586497e-06, + "loss": 0.13532910346984864, + "step": 4735 + }, + { + "epoch": 0.8624454148471615, + "grad_norm": 0.1565634310245514, + "learning_rate": 2.4684834387323943e-06, + "loss": 0.14146244525909424, + "step": 4740 + }, + { + "epoch": 0.8633551673944687, + "grad_norm": 0.18060864508152008, + "learning_rate": 2.4366884399382393e-06, + "loss": 0.14218534231185914, + "step": 4745 + }, + { + "epoch": 0.8642649199417758, + "grad_norm": 0.24613255262374878, + "learning_rate": 2.4050890432080557e-06, + "loss": 0.13907679319381713, + "step": 4750 + }, + { + "epoch": 0.865174672489083, + "grad_norm": 0.16036023199558258, + "learning_rate": 2.3736855224780057e-06, + "loss": 0.13718113899230958, + "step": 4755 + }, + { + "epoch": 0.86608442503639, + "grad_norm": 0.16678516566753387, + "learning_rate": 2.3424781499862075e-06, + "loss": 0.1327962040901184, + "step": 4760 + }, + { + "epoch": 0.8669941775836972, + "grad_norm": 0.1763770878314972, + "learning_rate": 2.3114671962703727e-06, + "loss": 0.14390318393707274, + "step": 4765 + }, + { + "epoch": 0.8679039301310044, + "grad_norm": 0.17735697329044342, + "learning_rate": 2.280652930165428e-06, + "loss": 0.15223288536071777, + "step": 4770 + }, + { + "epoch": 0.8688136826783115, + "grad_norm": 0.15827041864395142, + "learning_rate": 2.250035618801241e-06, + "loss": 0.14296332597732545, + "step": 4775 + }, + { + "epoch": 0.8697234352256187, + "grad_norm": 0.16876135766506195, + "learning_rate": 2.219615527600244e-06, + "loss": 0.1359076738357544, + "step": 4780 + }, + { + "epoch": 0.8706331877729258, + "grad_norm": 0.1800110638141632, + "learning_rate": 2.189392920275174e-06, + "loss": 0.1424281358718872, + "step": 4785 + }, + { + "epoch": 0.8715429403202329, + "grad_norm": 0.1409560889005661, + "learning_rate": 2.159368058826783e-06, + "loss": 0.14480490684509278, + "step": 4790 + }, + { + "epoch": 0.87245269286754, + "grad_norm": 0.1634288728237152, + "learning_rate": 2.129541203541535e-06, + "loss": 0.14513269662857056, + "step": 4795 + }, + { + "epoch": 0.8733624454148472, + "grad_norm": 0.17126062512397766, + "learning_rate": 2.099912612989391e-06, + "loss": 0.13546934127807617, + "step": 4800 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6398928370952863e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4800/training_args.bin b/checkpoint-4800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-4800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-4900/README.md b/checkpoint-4900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-4900/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-4900/adapter_config.json b/checkpoint-4900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-4900/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-4900/adapter_model.safetensors b/checkpoint-4900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7d51edf4626c3b55837775311a22fd372effdbae --- /dev/null +++ b/checkpoint-4900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f0911a72c2009ffcdaa13711a34bf91ac4328965a15f81fbe2cd8119715038 +size 169741912 diff --git a/checkpoint-4900/chat_template.jinja b/checkpoint-4900/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-4900/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-4900/optimizer.pt b/checkpoint-4900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6acbd0ce8f094f3a6f20f6507bea363e430f6cd4 --- /dev/null +++ b/checkpoint-4900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b71bcb4660a49d3ac1b098d9d6628096a0792a596b5545fa1d36cf13c890e6 +size 72807355 diff --git a/checkpoint-4900/processor_config.json b/checkpoint-4900/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-4900/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-4900/rng_state.pth b/checkpoint-4900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-4900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-4900/scheduler.pt b/checkpoint-4900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b449a85b5e3a28974a9a9d73bbca9d0b916c4c1a --- /dev/null +++ b/checkpoint-4900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94fba794993de2c072fab349e1124df8846a79fce1e519060425ea6e08ae19a7 +size 1465 diff --git a/checkpoint-4900/tokenizer.json b/checkpoint-4900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-4900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-4900/tokenizer_config.json b/checkpoint-4900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-4900/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-4900/trainer_state.json b/checkpoint-4900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6bce90e59bb5de169f031c8e1893176287a19433 --- /dev/null +++ b/checkpoint-4900/trainer_state.json @@ -0,0 +1,6902 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8915574963609898, + "eval_steps": 100, + "global_step": 4900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + }, + { + "epoch": 0.8196870451237264, + "grad_norm": 0.1393742561340332, + "learning_rate": 4.178055288745053e-06, + "loss": 0.13769235610961914, + "step": 4505 + }, + { + "epoch": 0.8205967976710334, + "grad_norm": 0.1668994128704071, + "learning_rate": 4.137406700206617e-06, + "loss": 0.14029752016067504, + "step": 4510 + }, + { + "epoch": 0.8215065502183406, + "grad_norm": 0.1833454668521881, + "learning_rate": 4.0969389701515675e-06, + "loss": 0.14276301860809326, + "step": 4515 + }, + { + "epoch": 0.8224163027656477, + "grad_norm": 0.16187874972820282, + "learning_rate": 4.056652449395945e-06, + "loss": 0.1444832682609558, + "step": 4520 + }, + { + "epoch": 0.8233260553129549, + "grad_norm": 0.1453280746936798, + "learning_rate": 4.01654748718488e-06, + "loss": 0.14512733221054078, + "step": 4525 + }, + { + "epoch": 0.824235807860262, + "grad_norm": 0.1782725751399994, + "learning_rate": 3.976624431189563e-06, + "loss": 0.14093561172485353, + "step": 4530 + }, + { + "epoch": 0.8251455604075691, + "grad_norm": 0.17374491691589355, + "learning_rate": 3.936883627504234e-06, + "loss": 0.14031401872634888, + "step": 4535 + }, + { + "epoch": 0.8260553129548762, + "grad_norm": 0.1609172821044922, + "learning_rate": 3.897325420643174e-06, + "loss": 0.1428336262702942, + "step": 4540 + }, + { + "epoch": 0.8269650655021834, + "grad_norm": 0.1520884931087494, + "learning_rate": 3.85795015353774e-06, + "loss": 0.1460547924041748, + "step": 4545 + }, + { + "epoch": 0.8278748180494906, + "grad_norm": 0.20986326038837433, + "learning_rate": 3.818758167533376e-06, + "loss": 0.14706350564956666, + "step": 4550 + }, + { + "epoch": 0.8287845705967977, + "grad_norm": 0.16825413703918457, + "learning_rate": 3.7797498023866396e-06, + "loss": 0.14507200717926025, + "step": 4555 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.16758380830287933, + "learning_rate": 3.740925396262296e-06, + "loss": 0.14898381233215333, + "step": 4560 + }, + { + "epoch": 0.8306040756914119, + "grad_norm": 0.15207453072071075, + "learning_rate": 3.7022852857303503e-06, + "loss": 0.14138854742050172, + "step": 4565 + }, + { + "epoch": 0.8315138282387191, + "grad_norm": 0.15150749683380127, + "learning_rate": 3.66382980576315e-06, + "loss": 0.13894975185394287, + "step": 4570 + }, + { + "epoch": 0.8324235807860262, + "grad_norm": 0.17071188986301422, + "learning_rate": 3.625559289732472e-06, + "loss": 0.14072470664978026, + "step": 4575 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.154335618019104, + "learning_rate": 3.5874740694066294e-06, + "loss": 0.13791344165802003, + "step": 4580 + }, + { + "epoch": 0.8342430858806404, + "grad_norm": 0.14017128944396973, + "learning_rate": 3.5495744749476116e-06, + "loss": 0.14427922964096068, + "step": 4585 + }, + { + "epoch": 0.8351528384279476, + "grad_norm": 0.17210033535957336, + "learning_rate": 3.5118608349081983e-06, + "loss": 0.15191166400909423, + "step": 4590 + }, + { + "epoch": 0.8360625909752547, + "grad_norm": 0.18715685606002808, + "learning_rate": 3.4743334762291358e-06, + "loss": 0.14451316595077515, + "step": 4595 + }, + { + "epoch": 0.8369723435225619, + "grad_norm": 0.18079884350299835, + "learning_rate": 3.436992724236293e-06, + "loss": 0.13530746698379517, + "step": 4600 + }, + { + "epoch": 0.837882096069869, + "grad_norm": 0.13519920408725739, + "learning_rate": 3.399838902637817e-06, + "loss": 0.1477964401245117, + "step": 4605 + }, + { + "epoch": 0.8387918486171762, + "grad_norm": 0.1778026670217514, + "learning_rate": 3.3628723335213885e-06, + "loss": 0.14419831037521363, + "step": 4610 + }, + { + "epoch": 0.8397016011644832, + "grad_norm": 0.15165366232395172, + "learning_rate": 3.326093337351355e-06, + "loss": 0.13888469934463502, + "step": 4615 + }, + { + "epoch": 0.8406113537117904, + "grad_norm": 0.17049473524093628, + "learning_rate": 3.2895022329660018e-06, + "loss": 0.14438477754592896, + "step": 4620 + }, + { + "epoch": 0.8415211062590975, + "grad_norm": 0.16536414623260498, + "learning_rate": 3.2530993375747833e-06, + "loss": 0.1444351315498352, + "step": 4625 + }, + { + "epoch": 0.8424308588064047, + "grad_norm": 0.17570015788078308, + "learning_rate": 3.2168849667555402e-06, + "loss": 0.13861945867538453, + "step": 4630 + }, + { + "epoch": 0.8433406113537117, + "grad_norm": 0.1699545532464981, + "learning_rate": 3.1808594344518132e-06, + "loss": 0.13902754783630372, + "step": 4635 + }, + { + "epoch": 0.8442503639010189, + "grad_norm": 0.12331254780292511, + "learning_rate": 3.1450230529700837e-06, + "loss": 0.14104254245758058, + "step": 4640 + }, + { + "epoch": 0.845160116448326, + "grad_norm": 0.1508190929889679, + "learning_rate": 3.1093761329770708e-06, + "loss": 0.13288766145706177, + "step": 4645 + }, + { + "epoch": 0.8460698689956332, + "grad_norm": 0.19049489498138428, + "learning_rate": 3.0739189834970735e-06, + "loss": 0.14914840459823608, + "step": 4650 + }, + { + "epoch": 0.8469796215429404, + "grad_norm": 0.1662369966506958, + "learning_rate": 3.0386519119092293e-06, + "loss": 0.14222898483276367, + "step": 4655 + }, + { + "epoch": 0.8478893740902474, + "grad_norm": 0.18985967338085175, + "learning_rate": 3.0035752239449126e-06, + "loss": 0.14431113004684448, + "step": 4660 + }, + { + "epoch": 0.8487991266375546, + "grad_norm": 0.17005261778831482, + "learning_rate": 2.9686892236850337e-06, + "loss": 0.14140807390213012, + "step": 4665 + }, + { + "epoch": 0.8497088791848617, + "grad_norm": 0.16786684095859528, + "learning_rate": 2.9339942135574394e-06, + "loss": 0.14161460399627684, + "step": 4670 + }, + { + "epoch": 0.8506186317321689, + "grad_norm": 0.16358181834220886, + "learning_rate": 2.899490494334281e-06, + "loss": 0.14674670696258546, + "step": 4675 + }, + { + "epoch": 0.851528384279476, + "grad_norm": 0.1651349812746048, + "learning_rate": 2.8651783651293867e-06, + "loss": 0.13794611692428588, + "step": 4680 + }, + { + "epoch": 0.8524381368267832, + "grad_norm": 0.16934923827648163, + "learning_rate": 2.831058123395694e-06, + "loss": 0.13199397325515747, + "step": 4685 + }, + { + "epoch": 0.8533478893740902, + "grad_norm": 0.1704150140285492, + "learning_rate": 2.797130064922665e-06, + "loss": 0.14044904708862305, + "step": 4690 + }, + { + "epoch": 0.8542576419213974, + "grad_norm": 0.1814192682504654, + "learning_rate": 2.7633944838337143e-06, + "loss": 0.1465100646018982, + "step": 4695 + }, + { + "epoch": 0.8551673944687045, + "grad_norm": 0.18942610919475555, + "learning_rate": 2.729851672583669e-06, + "loss": 0.14685982465744019, + "step": 4700 + }, + { + "epoch": 0.8560771470160117, + "grad_norm": 0.17895208299160004, + "learning_rate": 2.6965019219562155e-06, + "loss": 0.13971571922302245, + "step": 4705 + }, + { + "epoch": 0.8569868995633187, + "grad_norm": 0.22735828161239624, + "learning_rate": 2.6633455210614055e-06, + "loss": 0.13776102066040039, + "step": 4710 + }, + { + "epoch": 0.8578966521106259, + "grad_norm": 0.16779793798923492, + "learning_rate": 2.630382757333133e-06, + "loss": 0.14134042263031005, + "step": 4715 + }, + { + "epoch": 0.858806404657933, + "grad_norm": 0.2148888260126114, + "learning_rate": 2.597613916526637e-06, + "loss": 0.14680721759796142, + "step": 4720 + }, + { + "epoch": 0.8597161572052402, + "grad_norm": 0.16560257971286774, + "learning_rate": 2.565039282716045e-06, + "loss": 0.14137234687805175, + "step": 4725 + }, + { + "epoch": 0.8606259097525473, + "grad_norm": 0.16197068989276886, + "learning_rate": 2.532659138291879e-06, + "loss": 0.14969314336776735, + "step": 4730 + }, + { + "epoch": 0.8615356622998545, + "grad_norm": 0.14650246500968933, + "learning_rate": 2.5004737639586497e-06, + "loss": 0.13532910346984864, + "step": 4735 + }, + { + "epoch": 0.8624454148471615, + "grad_norm": 0.1565634310245514, + "learning_rate": 2.4684834387323943e-06, + "loss": 0.14146244525909424, + "step": 4740 + }, + { + "epoch": 0.8633551673944687, + "grad_norm": 0.18060864508152008, + "learning_rate": 2.4366884399382393e-06, + "loss": 0.14218534231185914, + "step": 4745 + }, + { + "epoch": 0.8642649199417758, + "grad_norm": 0.24613255262374878, + "learning_rate": 2.4050890432080557e-06, + "loss": 0.13907679319381713, + "step": 4750 + }, + { + "epoch": 0.865174672489083, + "grad_norm": 0.16036023199558258, + "learning_rate": 2.3736855224780057e-06, + "loss": 0.13718113899230958, + "step": 4755 + }, + { + "epoch": 0.86608442503639, + "grad_norm": 0.16678516566753387, + "learning_rate": 2.3424781499862075e-06, + "loss": 0.1327962040901184, + "step": 4760 + }, + { + "epoch": 0.8669941775836972, + "grad_norm": 0.1763770878314972, + "learning_rate": 2.3114671962703727e-06, + "loss": 0.14390318393707274, + "step": 4765 + }, + { + "epoch": 0.8679039301310044, + "grad_norm": 0.17735697329044342, + "learning_rate": 2.280652930165428e-06, + "loss": 0.15223288536071777, + "step": 4770 + }, + { + "epoch": 0.8688136826783115, + "grad_norm": 0.15827041864395142, + "learning_rate": 2.250035618801241e-06, + "loss": 0.14296332597732545, + "step": 4775 + }, + { + "epoch": 0.8697234352256187, + "grad_norm": 0.16876135766506195, + "learning_rate": 2.219615527600244e-06, + "loss": 0.1359076738357544, + "step": 4780 + }, + { + "epoch": 0.8706331877729258, + "grad_norm": 0.1800110638141632, + "learning_rate": 2.189392920275174e-06, + "loss": 0.1424281358718872, + "step": 4785 + }, + { + "epoch": 0.8715429403202329, + "grad_norm": 0.1409560889005661, + "learning_rate": 2.159368058826783e-06, + "loss": 0.14480490684509278, + "step": 4790 + }, + { + "epoch": 0.87245269286754, + "grad_norm": 0.1634288728237152, + "learning_rate": 2.129541203541535e-06, + "loss": 0.14513269662857056, + "step": 4795 + }, + { + "epoch": 0.8733624454148472, + "grad_norm": 0.17126062512397766, + "learning_rate": 2.099912612989391e-06, + "loss": 0.13546934127807617, + "step": 4800 + }, + { + "epoch": 0.8742721979621543, + "grad_norm": 0.16704080998897552, + "learning_rate": 2.0704825440215457e-06, + "loss": 0.13852492570877076, + "step": 4805 + }, + { + "epoch": 0.8751819505094615, + "grad_norm": 0.1725970208644867, + "learning_rate": 2.0412512517681946e-06, + "loss": 0.14504197835922242, + "step": 4810 + }, + { + "epoch": 0.8760917030567685, + "grad_norm": 0.1700201779603958, + "learning_rate": 2.0122189896363387e-06, + "loss": 0.14312338829040527, + "step": 4815 + }, + { + "epoch": 0.8770014556040757, + "grad_norm": 0.16491736471652985, + "learning_rate": 1.9833860093075834e-06, + "loss": 0.14062976837158203, + "step": 4820 + }, + { + "epoch": 0.8779112081513828, + "grad_norm": 0.13748787343502045, + "learning_rate": 1.9547525607359537e-06, + "loss": 0.1346171498298645, + "step": 4825 + }, + { + "epoch": 0.87882096069869, + "grad_norm": 0.16399399936199188, + "learning_rate": 1.926318892145712e-06, + "loss": 0.14178123474121093, + "step": 4830 + }, + { + "epoch": 0.879730713245997, + "grad_norm": 0.14491963386535645, + "learning_rate": 1.8980852500292412e-06, + "loss": 0.1408564567565918, + "step": 4835 + }, + { + "epoch": 0.8806404657933042, + "grad_norm": 0.17335423827171326, + "learning_rate": 1.8700518791448851e-06, + "loss": 0.14403265714645386, + "step": 4840 + }, + { + "epoch": 0.8815502183406113, + "grad_norm": 0.17399625480175018, + "learning_rate": 1.8422190225148155e-06, + "loss": 0.14289036989212037, + "step": 4845 + }, + { + "epoch": 0.8824599708879185, + "grad_norm": 0.17945612967014313, + "learning_rate": 1.814586921422956e-06, + "loss": 0.14494109153747559, + "step": 4850 + }, + { + "epoch": 0.8833697234352256, + "grad_norm": 0.1910620480775833, + "learning_rate": 1.7871558154128664e-06, + "loss": 0.13726245164871215, + "step": 4855 + }, + { + "epoch": 0.8842794759825328, + "grad_norm": 0.1771879345178604, + "learning_rate": 1.7599259422856756e-06, + "loss": 0.1464752197265625, + "step": 4860 + }, + { + "epoch": 0.8851892285298398, + "grad_norm": 0.19427461922168732, + "learning_rate": 1.7328975380980218e-06, + "loss": 0.13823356628417968, + "step": 4865 + }, + { + "epoch": 0.886098981077147, + "grad_norm": 0.1491149365901947, + "learning_rate": 1.7060708371599897e-06, + "loss": 0.1338604211807251, + "step": 4870 + }, + { + "epoch": 0.8870087336244541, + "grad_norm": 0.16087733209133148, + "learning_rate": 1.6794460720331057e-06, + "loss": 0.14184389114379883, + "step": 4875 + }, + { + "epoch": 0.8879184861717613, + "grad_norm": 0.14506325125694275, + "learning_rate": 1.653023473528309e-06, + "loss": 0.14267687797546386, + "step": 4880 + }, + { + "epoch": 0.8888282387190685, + "grad_norm": 0.16886365413665771, + "learning_rate": 1.626803270703936e-06, + "loss": 0.14266083240509034, + "step": 4885 + }, + { + "epoch": 0.8897379912663755, + "grad_norm": 0.1891999989748001, + "learning_rate": 1.6007856908637652e-06, + "loss": 0.1398016929626465, + "step": 4890 + }, + { + "epoch": 0.8906477438136827, + "grad_norm": 0.17645299434661865, + "learning_rate": 1.5749709595550083e-06, + "loss": 0.13869571685791016, + "step": 4895 + }, + { + "epoch": 0.8915574963609898, + "grad_norm": 0.17714262008666992, + "learning_rate": 1.549359300566408e-06, + "loss": 0.14957486391067504, + "step": 4900 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.694709851825631e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-4900/training_args.bin b/checkpoint-4900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-4900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-500/README.md b/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-500/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-500/adapter_config.json b/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-500/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-500/adapter_model.safetensors b/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..57f073f55be4d5b6476317a2dd5c9d7c012c731e --- /dev/null +++ b/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e72da5e72729a82ed1e74cb4530ea283258fbc2e6a672f1720f78e35833fc693 +size 169741912 diff --git a/checkpoint-500/chat_template.jinja b/checkpoint-500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-500/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1c23368da91ccb321cdb11a33822f5ce9681e88 --- /dev/null +++ b/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57690ba6db6b1c1317be364e116f785706400e214fc68f6bac0f5f05a58f044a +size 72807355 diff --git a/checkpoint-500/processor_config.json b/checkpoint-500/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-500/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dedb3acf3e322ab265844313e4843a1c87c685f5 --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c96ac0a9f378033255ff6badf107ce844a8d4aa8acf7d0f966846c207b52eaef +size 1465 diff --git a/checkpoint-500/tokenizer.json b/checkpoint-500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-500/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..84a830d845225c501fec2be7ed3ca813c30edc50 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,742 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.09097525473071325, + "eval_steps": 100, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8430480830875264e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-5000/README.md b/checkpoint-5000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-5000/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-5000/adapter_config.json b/checkpoint-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-5000/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5000/adapter_model.safetensors b/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..49e258125bc310d7c820342ac761d1fe8b70b818 --- /dev/null +++ b/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:062304accc1e0e63859762076339d9b10c027abb482653c443866239fa7d30c9 +size 169741912 diff --git a/checkpoint-5000/chat_template.jinja b/checkpoint-5000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-5000/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-5000/optimizer.pt b/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a970be0b82ebb489cb30cebae9385376466a3534 --- /dev/null +++ b/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf925baf611c389d14cd98bfb1ec7673cb616dc7356b485848d57d356399989a +size 72807355 diff --git a/checkpoint-5000/processor_config.json b/checkpoint-5000/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-5000/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-5000/rng_state.pth b/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-5000/scheduler.pt b/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0660753bd7e780deec81470e1025d35fa4b1dd8 --- /dev/null +++ b/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc5e6b6ccc6bc308c743e552cb17737cf764433750ecaa57b00cf76b0e4a1c85 +size 1465 diff --git a/checkpoint-5000/tokenizer.json b/checkpoint-5000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-5000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-5000/tokenizer_config.json b/checkpoint-5000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-5000/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-5000/trainer_state.json b/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9e750a5fad76573f641e0468ee9006c9af08f7ba --- /dev/null +++ b/checkpoint-5000/trainer_state.json @@ -0,0 +1,7042 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9097525473071325, + "eval_steps": 100, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + }, + { + "epoch": 0.8196870451237264, + "grad_norm": 0.1393742561340332, + "learning_rate": 4.178055288745053e-06, + "loss": 0.13769235610961914, + "step": 4505 + }, + { + "epoch": 0.8205967976710334, + "grad_norm": 0.1668994128704071, + "learning_rate": 4.137406700206617e-06, + "loss": 0.14029752016067504, + "step": 4510 + }, + { + "epoch": 0.8215065502183406, + "grad_norm": 0.1833454668521881, + "learning_rate": 4.0969389701515675e-06, + "loss": 0.14276301860809326, + "step": 4515 + }, + { + "epoch": 0.8224163027656477, + "grad_norm": 0.16187874972820282, + "learning_rate": 4.056652449395945e-06, + "loss": 0.1444832682609558, + "step": 4520 + }, + { + "epoch": 0.8233260553129549, + "grad_norm": 0.1453280746936798, + "learning_rate": 4.01654748718488e-06, + "loss": 0.14512733221054078, + "step": 4525 + }, + { + "epoch": 0.824235807860262, + "grad_norm": 0.1782725751399994, + "learning_rate": 3.976624431189563e-06, + "loss": 0.14093561172485353, + "step": 4530 + }, + { + "epoch": 0.8251455604075691, + "grad_norm": 0.17374491691589355, + "learning_rate": 3.936883627504234e-06, + "loss": 0.14031401872634888, + "step": 4535 + }, + { + "epoch": 0.8260553129548762, + "grad_norm": 0.1609172821044922, + "learning_rate": 3.897325420643174e-06, + "loss": 0.1428336262702942, + "step": 4540 + }, + { + "epoch": 0.8269650655021834, + "grad_norm": 0.1520884931087494, + "learning_rate": 3.85795015353774e-06, + "loss": 0.1460547924041748, + "step": 4545 + }, + { + "epoch": 0.8278748180494906, + "grad_norm": 0.20986326038837433, + "learning_rate": 3.818758167533376e-06, + "loss": 0.14706350564956666, + "step": 4550 + }, + { + "epoch": 0.8287845705967977, + "grad_norm": 0.16825413703918457, + "learning_rate": 3.7797498023866396e-06, + "loss": 0.14507200717926025, + "step": 4555 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.16758380830287933, + "learning_rate": 3.740925396262296e-06, + "loss": 0.14898381233215333, + "step": 4560 + }, + { + "epoch": 0.8306040756914119, + "grad_norm": 0.15207453072071075, + "learning_rate": 3.7022852857303503e-06, + "loss": 0.14138854742050172, + "step": 4565 + }, + { + "epoch": 0.8315138282387191, + "grad_norm": 0.15150749683380127, + "learning_rate": 3.66382980576315e-06, + "loss": 0.13894975185394287, + "step": 4570 + }, + { + "epoch": 0.8324235807860262, + "grad_norm": 0.17071188986301422, + "learning_rate": 3.625559289732472e-06, + "loss": 0.14072470664978026, + "step": 4575 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.154335618019104, + "learning_rate": 3.5874740694066294e-06, + "loss": 0.13791344165802003, + "step": 4580 + }, + { + "epoch": 0.8342430858806404, + "grad_norm": 0.14017128944396973, + "learning_rate": 3.5495744749476116e-06, + "loss": 0.14427922964096068, + "step": 4585 + }, + { + "epoch": 0.8351528384279476, + "grad_norm": 0.17210033535957336, + "learning_rate": 3.5118608349081983e-06, + "loss": 0.15191166400909423, + "step": 4590 + }, + { + "epoch": 0.8360625909752547, + "grad_norm": 0.18715685606002808, + "learning_rate": 3.4743334762291358e-06, + "loss": 0.14451316595077515, + "step": 4595 + }, + { + "epoch": 0.8369723435225619, + "grad_norm": 0.18079884350299835, + "learning_rate": 3.436992724236293e-06, + "loss": 0.13530746698379517, + "step": 4600 + }, + { + "epoch": 0.837882096069869, + "grad_norm": 0.13519920408725739, + "learning_rate": 3.399838902637817e-06, + "loss": 0.1477964401245117, + "step": 4605 + }, + { + "epoch": 0.8387918486171762, + "grad_norm": 0.1778026670217514, + "learning_rate": 3.3628723335213885e-06, + "loss": 0.14419831037521363, + "step": 4610 + }, + { + "epoch": 0.8397016011644832, + "grad_norm": 0.15165366232395172, + "learning_rate": 3.326093337351355e-06, + "loss": 0.13888469934463502, + "step": 4615 + }, + { + "epoch": 0.8406113537117904, + "grad_norm": 0.17049473524093628, + "learning_rate": 3.2895022329660018e-06, + "loss": 0.14438477754592896, + "step": 4620 + }, + { + "epoch": 0.8415211062590975, + "grad_norm": 0.16536414623260498, + "learning_rate": 3.2530993375747833e-06, + "loss": 0.1444351315498352, + "step": 4625 + }, + { + "epoch": 0.8424308588064047, + "grad_norm": 0.17570015788078308, + "learning_rate": 3.2168849667555402e-06, + "loss": 0.13861945867538453, + "step": 4630 + }, + { + "epoch": 0.8433406113537117, + "grad_norm": 0.1699545532464981, + "learning_rate": 3.1808594344518132e-06, + "loss": 0.13902754783630372, + "step": 4635 + }, + { + "epoch": 0.8442503639010189, + "grad_norm": 0.12331254780292511, + "learning_rate": 3.1450230529700837e-06, + "loss": 0.14104254245758058, + "step": 4640 + }, + { + "epoch": 0.845160116448326, + "grad_norm": 0.1508190929889679, + "learning_rate": 3.1093761329770708e-06, + "loss": 0.13288766145706177, + "step": 4645 + }, + { + "epoch": 0.8460698689956332, + "grad_norm": 0.19049489498138428, + "learning_rate": 3.0739189834970735e-06, + "loss": 0.14914840459823608, + "step": 4650 + }, + { + "epoch": 0.8469796215429404, + "grad_norm": 0.1662369966506958, + "learning_rate": 3.0386519119092293e-06, + "loss": 0.14222898483276367, + "step": 4655 + }, + { + "epoch": 0.8478893740902474, + "grad_norm": 0.18985967338085175, + "learning_rate": 3.0035752239449126e-06, + "loss": 0.14431113004684448, + "step": 4660 + }, + { + "epoch": 0.8487991266375546, + "grad_norm": 0.17005261778831482, + "learning_rate": 2.9686892236850337e-06, + "loss": 0.14140807390213012, + "step": 4665 + }, + { + "epoch": 0.8497088791848617, + "grad_norm": 0.16786684095859528, + "learning_rate": 2.9339942135574394e-06, + "loss": 0.14161460399627684, + "step": 4670 + }, + { + "epoch": 0.8506186317321689, + "grad_norm": 0.16358181834220886, + "learning_rate": 2.899490494334281e-06, + "loss": 0.14674670696258546, + "step": 4675 + }, + { + "epoch": 0.851528384279476, + "grad_norm": 0.1651349812746048, + "learning_rate": 2.8651783651293867e-06, + "loss": 0.13794611692428588, + "step": 4680 + }, + { + "epoch": 0.8524381368267832, + "grad_norm": 0.16934923827648163, + "learning_rate": 2.831058123395694e-06, + "loss": 0.13199397325515747, + "step": 4685 + }, + { + "epoch": 0.8533478893740902, + "grad_norm": 0.1704150140285492, + "learning_rate": 2.797130064922665e-06, + "loss": 0.14044904708862305, + "step": 4690 + }, + { + "epoch": 0.8542576419213974, + "grad_norm": 0.1814192682504654, + "learning_rate": 2.7633944838337143e-06, + "loss": 0.1465100646018982, + "step": 4695 + }, + { + "epoch": 0.8551673944687045, + "grad_norm": 0.18942610919475555, + "learning_rate": 2.729851672583669e-06, + "loss": 0.14685982465744019, + "step": 4700 + }, + { + "epoch": 0.8560771470160117, + "grad_norm": 0.17895208299160004, + "learning_rate": 2.6965019219562155e-06, + "loss": 0.13971571922302245, + "step": 4705 + }, + { + "epoch": 0.8569868995633187, + "grad_norm": 0.22735828161239624, + "learning_rate": 2.6633455210614055e-06, + "loss": 0.13776102066040039, + "step": 4710 + }, + { + "epoch": 0.8578966521106259, + "grad_norm": 0.16779793798923492, + "learning_rate": 2.630382757333133e-06, + "loss": 0.14134042263031005, + "step": 4715 + }, + { + "epoch": 0.858806404657933, + "grad_norm": 0.2148888260126114, + "learning_rate": 2.597613916526637e-06, + "loss": 0.14680721759796142, + "step": 4720 + }, + { + "epoch": 0.8597161572052402, + "grad_norm": 0.16560257971286774, + "learning_rate": 2.565039282716045e-06, + "loss": 0.14137234687805175, + "step": 4725 + }, + { + "epoch": 0.8606259097525473, + "grad_norm": 0.16197068989276886, + "learning_rate": 2.532659138291879e-06, + "loss": 0.14969314336776735, + "step": 4730 + }, + { + "epoch": 0.8615356622998545, + "grad_norm": 0.14650246500968933, + "learning_rate": 2.5004737639586497e-06, + "loss": 0.13532910346984864, + "step": 4735 + }, + { + "epoch": 0.8624454148471615, + "grad_norm": 0.1565634310245514, + "learning_rate": 2.4684834387323943e-06, + "loss": 0.14146244525909424, + "step": 4740 + }, + { + "epoch": 0.8633551673944687, + "grad_norm": 0.18060864508152008, + "learning_rate": 2.4366884399382393e-06, + "loss": 0.14218534231185914, + "step": 4745 + }, + { + "epoch": 0.8642649199417758, + "grad_norm": 0.24613255262374878, + "learning_rate": 2.4050890432080557e-06, + "loss": 0.13907679319381713, + "step": 4750 + }, + { + "epoch": 0.865174672489083, + "grad_norm": 0.16036023199558258, + "learning_rate": 2.3736855224780057e-06, + "loss": 0.13718113899230958, + "step": 4755 + }, + { + "epoch": 0.86608442503639, + "grad_norm": 0.16678516566753387, + "learning_rate": 2.3424781499862075e-06, + "loss": 0.1327962040901184, + "step": 4760 + }, + { + "epoch": 0.8669941775836972, + "grad_norm": 0.1763770878314972, + "learning_rate": 2.3114671962703727e-06, + "loss": 0.14390318393707274, + "step": 4765 + }, + { + "epoch": 0.8679039301310044, + "grad_norm": 0.17735697329044342, + "learning_rate": 2.280652930165428e-06, + "loss": 0.15223288536071777, + "step": 4770 + }, + { + "epoch": 0.8688136826783115, + "grad_norm": 0.15827041864395142, + "learning_rate": 2.250035618801241e-06, + "loss": 0.14296332597732545, + "step": 4775 + }, + { + "epoch": 0.8697234352256187, + "grad_norm": 0.16876135766506195, + "learning_rate": 2.219615527600244e-06, + "loss": 0.1359076738357544, + "step": 4780 + }, + { + "epoch": 0.8706331877729258, + "grad_norm": 0.1800110638141632, + "learning_rate": 2.189392920275174e-06, + "loss": 0.1424281358718872, + "step": 4785 + }, + { + "epoch": 0.8715429403202329, + "grad_norm": 0.1409560889005661, + "learning_rate": 2.159368058826783e-06, + "loss": 0.14480490684509278, + "step": 4790 + }, + { + "epoch": 0.87245269286754, + "grad_norm": 0.1634288728237152, + "learning_rate": 2.129541203541535e-06, + "loss": 0.14513269662857056, + "step": 4795 + }, + { + "epoch": 0.8733624454148472, + "grad_norm": 0.17126062512397766, + "learning_rate": 2.099912612989391e-06, + "loss": 0.13546934127807617, + "step": 4800 + }, + { + "epoch": 0.8742721979621543, + "grad_norm": 0.16704080998897552, + "learning_rate": 2.0704825440215457e-06, + "loss": 0.13852492570877076, + "step": 4805 + }, + { + "epoch": 0.8751819505094615, + "grad_norm": 0.1725970208644867, + "learning_rate": 2.0412512517681946e-06, + "loss": 0.14504197835922242, + "step": 4810 + }, + { + "epoch": 0.8760917030567685, + "grad_norm": 0.1700201779603958, + "learning_rate": 2.0122189896363387e-06, + "loss": 0.14312338829040527, + "step": 4815 + }, + { + "epoch": 0.8770014556040757, + "grad_norm": 0.16491736471652985, + "learning_rate": 1.9833860093075834e-06, + "loss": 0.14062976837158203, + "step": 4820 + }, + { + "epoch": 0.8779112081513828, + "grad_norm": 0.13748787343502045, + "learning_rate": 1.9547525607359537e-06, + "loss": 0.1346171498298645, + "step": 4825 + }, + { + "epoch": 0.87882096069869, + "grad_norm": 0.16399399936199188, + "learning_rate": 1.926318892145712e-06, + "loss": 0.14178123474121093, + "step": 4830 + }, + { + "epoch": 0.879730713245997, + "grad_norm": 0.14491963386535645, + "learning_rate": 1.8980852500292412e-06, + "loss": 0.1408564567565918, + "step": 4835 + }, + { + "epoch": 0.8806404657933042, + "grad_norm": 0.17335423827171326, + "learning_rate": 1.8700518791448851e-06, + "loss": 0.14403265714645386, + "step": 4840 + }, + { + "epoch": 0.8815502183406113, + "grad_norm": 0.17399625480175018, + "learning_rate": 1.8422190225148155e-06, + "loss": 0.14289036989212037, + "step": 4845 + }, + { + "epoch": 0.8824599708879185, + "grad_norm": 0.17945612967014313, + "learning_rate": 1.814586921422956e-06, + "loss": 0.14494109153747559, + "step": 4850 + }, + { + "epoch": 0.8833697234352256, + "grad_norm": 0.1910620480775833, + "learning_rate": 1.7871558154128664e-06, + "loss": 0.13726245164871215, + "step": 4855 + }, + { + "epoch": 0.8842794759825328, + "grad_norm": 0.1771879345178604, + "learning_rate": 1.7599259422856756e-06, + "loss": 0.1464752197265625, + "step": 4860 + }, + { + "epoch": 0.8851892285298398, + "grad_norm": 0.19427461922168732, + "learning_rate": 1.7328975380980218e-06, + "loss": 0.13823356628417968, + "step": 4865 + }, + { + "epoch": 0.886098981077147, + "grad_norm": 0.1491149365901947, + "learning_rate": 1.7060708371599897e-06, + "loss": 0.1338604211807251, + "step": 4870 + }, + { + "epoch": 0.8870087336244541, + "grad_norm": 0.16087733209133148, + "learning_rate": 1.6794460720331057e-06, + "loss": 0.14184389114379883, + "step": 4875 + }, + { + "epoch": 0.8879184861717613, + "grad_norm": 0.14506325125694275, + "learning_rate": 1.653023473528309e-06, + "loss": 0.14267687797546386, + "step": 4880 + }, + { + "epoch": 0.8888282387190685, + "grad_norm": 0.16886365413665771, + "learning_rate": 1.626803270703936e-06, + "loss": 0.14266083240509034, + "step": 4885 + }, + { + "epoch": 0.8897379912663755, + "grad_norm": 0.1891999989748001, + "learning_rate": 1.6007856908637652e-06, + "loss": 0.1398016929626465, + "step": 4890 + }, + { + "epoch": 0.8906477438136827, + "grad_norm": 0.17645299434661865, + "learning_rate": 1.5749709595550083e-06, + "loss": 0.13869571685791016, + "step": 4895 + }, + { + "epoch": 0.8915574963609898, + "grad_norm": 0.17714262008666992, + "learning_rate": 1.549359300566408e-06, + "loss": 0.14957486391067504, + "step": 4900 + }, + { + "epoch": 0.892467248908297, + "grad_norm": 0.18025240302085876, + "learning_rate": 1.5239509359262355e-06, + "loss": 0.1358652949333191, + "step": 4905 + }, + { + "epoch": 0.8933770014556041, + "grad_norm": 0.17539937794208527, + "learning_rate": 1.4987460859004154e-06, + "loss": 0.13833394050598144, + "step": 4910 + }, + { + "epoch": 0.8942867540029112, + "grad_norm": 0.1772230565547943, + "learning_rate": 1.4737449689905953e-06, + "loss": 0.14202116727828978, + "step": 4915 + }, + { + "epoch": 0.8951965065502183, + "grad_norm": 0.1670161783695221, + "learning_rate": 1.4489478019322433e-06, + "loss": 0.1403665542602539, + "step": 4920 + }, + { + "epoch": 0.8961062590975255, + "grad_norm": 0.1697034239768982, + "learning_rate": 1.4243547996927926e-06, + "loss": 0.1401481032371521, + "step": 4925 + }, + { + "epoch": 0.8970160116448326, + "grad_norm": 0.16474860906600952, + "learning_rate": 1.3999661754697636e-06, + "loss": 0.13969850540161133, + "step": 4930 + }, + { + "epoch": 0.8979257641921398, + "grad_norm": 0.1664883941411972, + "learning_rate": 1.3757821406889027e-06, + "loss": 0.1399069309234619, + "step": 4935 + }, + { + "epoch": 0.8988355167394468, + "grad_norm": 0.16675794124603271, + "learning_rate": 1.351802905002386e-06, + "loss": 0.14129226207733153, + "step": 4940 + }, + { + "epoch": 0.899745269286754, + "grad_norm": 0.17529809474945068, + "learning_rate": 1.3280286762869632e-06, + "loss": 0.14663081169128417, + "step": 4945 + }, + { + "epoch": 0.9006550218340611, + "grad_norm": 0.17758169770240784, + "learning_rate": 1.3044596606421795e-06, + "loss": 0.13986254930496217, + "step": 4950 + }, + { + "epoch": 0.9015647743813683, + "grad_norm": 0.153225839138031, + "learning_rate": 1.2810960623885815e-06, + "loss": 0.14236698150634766, + "step": 4955 + }, + { + "epoch": 0.9024745269286754, + "grad_norm": 0.169761523604393, + "learning_rate": 1.2579380840659376e-06, + "loss": 0.1450445055961609, + "step": 4960 + }, + { + "epoch": 0.9033842794759825, + "grad_norm": 0.16659331321716309, + "learning_rate": 1.2349859264315034e-06, + "loss": 0.14043926000595092, + "step": 4965 + }, + { + "epoch": 0.9042940320232896, + "grad_norm": 0.16748706996440887, + "learning_rate": 1.2122397884582553e-06, + "loss": 0.14725675582885742, + "step": 4970 + }, + { + "epoch": 0.9052037845705968, + "grad_norm": 0.1600511223077774, + "learning_rate": 1.1896998673331883e-06, + "loss": 0.14551150798797607, + "step": 4975 + }, + { + "epoch": 0.9061135371179039, + "grad_norm": 0.24318362772464752, + "learning_rate": 1.1673663584555934e-06, + "loss": 0.14470888376235963, + "step": 4980 + }, + { + "epoch": 0.9070232896652111, + "grad_norm": 0.16443821787834167, + "learning_rate": 1.1452394554353706e-06, + "loss": 0.13639854192733764, + "step": 4985 + }, + { + "epoch": 0.9079330422125182, + "grad_norm": 0.14277774095535278, + "learning_rate": 1.1233193500913453e-06, + "loss": 0.13749881982803344, + "step": 4990 + }, + { + "epoch": 0.9088427947598253, + "grad_norm": 0.1610947549343109, + "learning_rate": 1.1016062324496008e-06, + "loss": 0.1385629653930664, + "step": 4995 + }, + { + "epoch": 0.9097525473071325, + "grad_norm": 0.17888498306274414, + "learning_rate": 1.080100290741845e-06, + "loss": 0.14225621223449708, + "step": 5000 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.748884707820236e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5000/training_args.bin b/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-5100/README.md b/checkpoint-5100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-5100/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-5100/adapter_config.json b/checkpoint-5100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-5100/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5100/adapter_model.safetensors b/checkpoint-5100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..991125bd90263b3b1244068c71f4d49ed24caf68 --- /dev/null +++ b/checkpoint-5100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85e9de4c60daac973c28ff33d1ea1bc93b4f25c4d855c1ef80e60071f02cc7a5 +size 169741912 diff --git a/checkpoint-5100/chat_template.jinja b/checkpoint-5100/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-5100/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-5100/optimizer.pt b/checkpoint-5100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..08a0964caa41269d32ba88e142ecca6ae873e12a --- /dev/null +++ b/checkpoint-5100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e155e4b28d9304fdc51c12782756f07045b4a1fd0e13dabdde7e825ff164cc2a +size 72807355 diff --git a/checkpoint-5100/processor_config.json b/checkpoint-5100/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-5100/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-5100/rng_state.pth b/checkpoint-5100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-5100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-5100/scheduler.pt b/checkpoint-5100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5872ecc7886691192f2f521f170cd8ab806e293e --- /dev/null +++ b/checkpoint-5100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c70a8a41be46fdfaa0c5bde2c2de38bd9f972e5f87edea20a2693858227a5b2d +size 1465 diff --git a/checkpoint-5100/tokenizer.json b/checkpoint-5100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-5100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-5100/tokenizer_config.json b/checkpoint-5100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-5100/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-5100/trainer_state.json b/checkpoint-5100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a8d3b488292433ce297a3993030e8f69cb21800d --- /dev/null +++ b/checkpoint-5100/trainer_state.json @@ -0,0 +1,7182 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9279475982532751, + "eval_steps": 100, + "global_step": 5100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + }, + { + "epoch": 0.8196870451237264, + "grad_norm": 0.1393742561340332, + "learning_rate": 4.178055288745053e-06, + "loss": 0.13769235610961914, + "step": 4505 + }, + { + "epoch": 0.8205967976710334, + "grad_norm": 0.1668994128704071, + "learning_rate": 4.137406700206617e-06, + "loss": 0.14029752016067504, + "step": 4510 + }, + { + "epoch": 0.8215065502183406, + "grad_norm": 0.1833454668521881, + "learning_rate": 4.0969389701515675e-06, + "loss": 0.14276301860809326, + "step": 4515 + }, + { + "epoch": 0.8224163027656477, + "grad_norm": 0.16187874972820282, + "learning_rate": 4.056652449395945e-06, + "loss": 0.1444832682609558, + "step": 4520 + }, + { + "epoch": 0.8233260553129549, + "grad_norm": 0.1453280746936798, + "learning_rate": 4.01654748718488e-06, + "loss": 0.14512733221054078, + "step": 4525 + }, + { + "epoch": 0.824235807860262, + "grad_norm": 0.1782725751399994, + "learning_rate": 3.976624431189563e-06, + "loss": 0.14093561172485353, + "step": 4530 + }, + { + "epoch": 0.8251455604075691, + "grad_norm": 0.17374491691589355, + "learning_rate": 3.936883627504234e-06, + "loss": 0.14031401872634888, + "step": 4535 + }, + { + "epoch": 0.8260553129548762, + "grad_norm": 0.1609172821044922, + "learning_rate": 3.897325420643174e-06, + "loss": 0.1428336262702942, + "step": 4540 + }, + { + "epoch": 0.8269650655021834, + "grad_norm": 0.1520884931087494, + "learning_rate": 3.85795015353774e-06, + "loss": 0.1460547924041748, + "step": 4545 + }, + { + "epoch": 0.8278748180494906, + "grad_norm": 0.20986326038837433, + "learning_rate": 3.818758167533376e-06, + "loss": 0.14706350564956666, + "step": 4550 + }, + { + "epoch": 0.8287845705967977, + "grad_norm": 0.16825413703918457, + "learning_rate": 3.7797498023866396e-06, + "loss": 0.14507200717926025, + "step": 4555 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.16758380830287933, + "learning_rate": 3.740925396262296e-06, + "loss": 0.14898381233215333, + "step": 4560 + }, + { + "epoch": 0.8306040756914119, + "grad_norm": 0.15207453072071075, + "learning_rate": 3.7022852857303503e-06, + "loss": 0.14138854742050172, + "step": 4565 + }, + { + "epoch": 0.8315138282387191, + "grad_norm": 0.15150749683380127, + "learning_rate": 3.66382980576315e-06, + "loss": 0.13894975185394287, + "step": 4570 + }, + { + "epoch": 0.8324235807860262, + "grad_norm": 0.17071188986301422, + "learning_rate": 3.625559289732472e-06, + "loss": 0.14072470664978026, + "step": 4575 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.154335618019104, + "learning_rate": 3.5874740694066294e-06, + "loss": 0.13791344165802003, + "step": 4580 + }, + { + "epoch": 0.8342430858806404, + "grad_norm": 0.14017128944396973, + "learning_rate": 3.5495744749476116e-06, + "loss": 0.14427922964096068, + "step": 4585 + }, + { + "epoch": 0.8351528384279476, + "grad_norm": 0.17210033535957336, + "learning_rate": 3.5118608349081983e-06, + "loss": 0.15191166400909423, + "step": 4590 + }, + { + "epoch": 0.8360625909752547, + "grad_norm": 0.18715685606002808, + "learning_rate": 3.4743334762291358e-06, + "loss": 0.14451316595077515, + "step": 4595 + }, + { + "epoch": 0.8369723435225619, + "grad_norm": 0.18079884350299835, + "learning_rate": 3.436992724236293e-06, + "loss": 0.13530746698379517, + "step": 4600 + }, + { + "epoch": 0.837882096069869, + "grad_norm": 0.13519920408725739, + "learning_rate": 3.399838902637817e-06, + "loss": 0.1477964401245117, + "step": 4605 + }, + { + "epoch": 0.8387918486171762, + "grad_norm": 0.1778026670217514, + "learning_rate": 3.3628723335213885e-06, + "loss": 0.14419831037521363, + "step": 4610 + }, + { + "epoch": 0.8397016011644832, + "grad_norm": 0.15165366232395172, + "learning_rate": 3.326093337351355e-06, + "loss": 0.13888469934463502, + "step": 4615 + }, + { + "epoch": 0.8406113537117904, + "grad_norm": 0.17049473524093628, + "learning_rate": 3.2895022329660018e-06, + "loss": 0.14438477754592896, + "step": 4620 + }, + { + "epoch": 0.8415211062590975, + "grad_norm": 0.16536414623260498, + "learning_rate": 3.2530993375747833e-06, + "loss": 0.1444351315498352, + "step": 4625 + }, + { + "epoch": 0.8424308588064047, + "grad_norm": 0.17570015788078308, + "learning_rate": 3.2168849667555402e-06, + "loss": 0.13861945867538453, + "step": 4630 + }, + { + "epoch": 0.8433406113537117, + "grad_norm": 0.1699545532464981, + "learning_rate": 3.1808594344518132e-06, + "loss": 0.13902754783630372, + "step": 4635 + }, + { + "epoch": 0.8442503639010189, + "grad_norm": 0.12331254780292511, + "learning_rate": 3.1450230529700837e-06, + "loss": 0.14104254245758058, + "step": 4640 + }, + { + "epoch": 0.845160116448326, + "grad_norm": 0.1508190929889679, + "learning_rate": 3.1093761329770708e-06, + "loss": 0.13288766145706177, + "step": 4645 + }, + { + "epoch": 0.8460698689956332, + "grad_norm": 0.19049489498138428, + "learning_rate": 3.0739189834970735e-06, + "loss": 0.14914840459823608, + "step": 4650 + }, + { + "epoch": 0.8469796215429404, + "grad_norm": 0.1662369966506958, + "learning_rate": 3.0386519119092293e-06, + "loss": 0.14222898483276367, + "step": 4655 + }, + { + "epoch": 0.8478893740902474, + "grad_norm": 0.18985967338085175, + "learning_rate": 3.0035752239449126e-06, + "loss": 0.14431113004684448, + "step": 4660 + }, + { + "epoch": 0.8487991266375546, + "grad_norm": 0.17005261778831482, + "learning_rate": 2.9686892236850337e-06, + "loss": 0.14140807390213012, + "step": 4665 + }, + { + "epoch": 0.8497088791848617, + "grad_norm": 0.16786684095859528, + "learning_rate": 2.9339942135574394e-06, + "loss": 0.14161460399627684, + "step": 4670 + }, + { + "epoch": 0.8506186317321689, + "grad_norm": 0.16358181834220886, + "learning_rate": 2.899490494334281e-06, + "loss": 0.14674670696258546, + "step": 4675 + }, + { + "epoch": 0.851528384279476, + "grad_norm": 0.1651349812746048, + "learning_rate": 2.8651783651293867e-06, + "loss": 0.13794611692428588, + "step": 4680 + }, + { + "epoch": 0.8524381368267832, + "grad_norm": 0.16934923827648163, + "learning_rate": 2.831058123395694e-06, + "loss": 0.13199397325515747, + "step": 4685 + }, + { + "epoch": 0.8533478893740902, + "grad_norm": 0.1704150140285492, + "learning_rate": 2.797130064922665e-06, + "loss": 0.14044904708862305, + "step": 4690 + }, + { + "epoch": 0.8542576419213974, + "grad_norm": 0.1814192682504654, + "learning_rate": 2.7633944838337143e-06, + "loss": 0.1465100646018982, + "step": 4695 + }, + { + "epoch": 0.8551673944687045, + "grad_norm": 0.18942610919475555, + "learning_rate": 2.729851672583669e-06, + "loss": 0.14685982465744019, + "step": 4700 + }, + { + "epoch": 0.8560771470160117, + "grad_norm": 0.17895208299160004, + "learning_rate": 2.6965019219562155e-06, + "loss": 0.13971571922302245, + "step": 4705 + }, + { + "epoch": 0.8569868995633187, + "grad_norm": 0.22735828161239624, + "learning_rate": 2.6633455210614055e-06, + "loss": 0.13776102066040039, + "step": 4710 + }, + { + "epoch": 0.8578966521106259, + "grad_norm": 0.16779793798923492, + "learning_rate": 2.630382757333133e-06, + "loss": 0.14134042263031005, + "step": 4715 + }, + { + "epoch": 0.858806404657933, + "grad_norm": 0.2148888260126114, + "learning_rate": 2.597613916526637e-06, + "loss": 0.14680721759796142, + "step": 4720 + }, + { + "epoch": 0.8597161572052402, + "grad_norm": 0.16560257971286774, + "learning_rate": 2.565039282716045e-06, + "loss": 0.14137234687805175, + "step": 4725 + }, + { + "epoch": 0.8606259097525473, + "grad_norm": 0.16197068989276886, + "learning_rate": 2.532659138291879e-06, + "loss": 0.14969314336776735, + "step": 4730 + }, + { + "epoch": 0.8615356622998545, + "grad_norm": 0.14650246500968933, + "learning_rate": 2.5004737639586497e-06, + "loss": 0.13532910346984864, + "step": 4735 + }, + { + "epoch": 0.8624454148471615, + "grad_norm": 0.1565634310245514, + "learning_rate": 2.4684834387323943e-06, + "loss": 0.14146244525909424, + "step": 4740 + }, + { + "epoch": 0.8633551673944687, + "grad_norm": 0.18060864508152008, + "learning_rate": 2.4366884399382393e-06, + "loss": 0.14218534231185914, + "step": 4745 + }, + { + "epoch": 0.8642649199417758, + "grad_norm": 0.24613255262374878, + "learning_rate": 2.4050890432080557e-06, + "loss": 0.13907679319381713, + "step": 4750 + }, + { + "epoch": 0.865174672489083, + "grad_norm": 0.16036023199558258, + "learning_rate": 2.3736855224780057e-06, + "loss": 0.13718113899230958, + "step": 4755 + }, + { + "epoch": 0.86608442503639, + "grad_norm": 0.16678516566753387, + "learning_rate": 2.3424781499862075e-06, + "loss": 0.1327962040901184, + "step": 4760 + }, + { + "epoch": 0.8669941775836972, + "grad_norm": 0.1763770878314972, + "learning_rate": 2.3114671962703727e-06, + "loss": 0.14390318393707274, + "step": 4765 + }, + { + "epoch": 0.8679039301310044, + "grad_norm": 0.17735697329044342, + "learning_rate": 2.280652930165428e-06, + "loss": 0.15223288536071777, + "step": 4770 + }, + { + "epoch": 0.8688136826783115, + "grad_norm": 0.15827041864395142, + "learning_rate": 2.250035618801241e-06, + "loss": 0.14296332597732545, + "step": 4775 + }, + { + "epoch": 0.8697234352256187, + "grad_norm": 0.16876135766506195, + "learning_rate": 2.219615527600244e-06, + "loss": 0.1359076738357544, + "step": 4780 + }, + { + "epoch": 0.8706331877729258, + "grad_norm": 0.1800110638141632, + "learning_rate": 2.189392920275174e-06, + "loss": 0.1424281358718872, + "step": 4785 + }, + { + "epoch": 0.8715429403202329, + "grad_norm": 0.1409560889005661, + "learning_rate": 2.159368058826783e-06, + "loss": 0.14480490684509278, + "step": 4790 + }, + { + "epoch": 0.87245269286754, + "grad_norm": 0.1634288728237152, + "learning_rate": 2.129541203541535e-06, + "loss": 0.14513269662857056, + "step": 4795 + }, + { + "epoch": 0.8733624454148472, + "grad_norm": 0.17126062512397766, + "learning_rate": 2.099912612989391e-06, + "loss": 0.13546934127807617, + "step": 4800 + }, + { + "epoch": 0.8742721979621543, + "grad_norm": 0.16704080998897552, + "learning_rate": 2.0704825440215457e-06, + "loss": 0.13852492570877076, + "step": 4805 + }, + { + "epoch": 0.8751819505094615, + "grad_norm": 0.1725970208644867, + "learning_rate": 2.0412512517681946e-06, + "loss": 0.14504197835922242, + "step": 4810 + }, + { + "epoch": 0.8760917030567685, + "grad_norm": 0.1700201779603958, + "learning_rate": 2.0122189896363387e-06, + "loss": 0.14312338829040527, + "step": 4815 + }, + { + "epoch": 0.8770014556040757, + "grad_norm": 0.16491736471652985, + "learning_rate": 1.9833860093075834e-06, + "loss": 0.14062976837158203, + "step": 4820 + }, + { + "epoch": 0.8779112081513828, + "grad_norm": 0.13748787343502045, + "learning_rate": 1.9547525607359537e-06, + "loss": 0.1346171498298645, + "step": 4825 + }, + { + "epoch": 0.87882096069869, + "grad_norm": 0.16399399936199188, + "learning_rate": 1.926318892145712e-06, + "loss": 0.14178123474121093, + "step": 4830 + }, + { + "epoch": 0.879730713245997, + "grad_norm": 0.14491963386535645, + "learning_rate": 1.8980852500292412e-06, + "loss": 0.1408564567565918, + "step": 4835 + }, + { + "epoch": 0.8806404657933042, + "grad_norm": 0.17335423827171326, + "learning_rate": 1.8700518791448851e-06, + "loss": 0.14403265714645386, + "step": 4840 + }, + { + "epoch": 0.8815502183406113, + "grad_norm": 0.17399625480175018, + "learning_rate": 1.8422190225148155e-06, + "loss": 0.14289036989212037, + "step": 4845 + }, + { + "epoch": 0.8824599708879185, + "grad_norm": 0.17945612967014313, + "learning_rate": 1.814586921422956e-06, + "loss": 0.14494109153747559, + "step": 4850 + }, + { + "epoch": 0.8833697234352256, + "grad_norm": 0.1910620480775833, + "learning_rate": 1.7871558154128664e-06, + "loss": 0.13726245164871215, + "step": 4855 + }, + { + "epoch": 0.8842794759825328, + "grad_norm": 0.1771879345178604, + "learning_rate": 1.7599259422856756e-06, + "loss": 0.1464752197265625, + "step": 4860 + }, + { + "epoch": 0.8851892285298398, + "grad_norm": 0.19427461922168732, + "learning_rate": 1.7328975380980218e-06, + "loss": 0.13823356628417968, + "step": 4865 + }, + { + "epoch": 0.886098981077147, + "grad_norm": 0.1491149365901947, + "learning_rate": 1.7060708371599897e-06, + "loss": 0.1338604211807251, + "step": 4870 + }, + { + "epoch": 0.8870087336244541, + "grad_norm": 0.16087733209133148, + "learning_rate": 1.6794460720331057e-06, + "loss": 0.14184389114379883, + "step": 4875 + }, + { + "epoch": 0.8879184861717613, + "grad_norm": 0.14506325125694275, + "learning_rate": 1.653023473528309e-06, + "loss": 0.14267687797546386, + "step": 4880 + }, + { + "epoch": 0.8888282387190685, + "grad_norm": 0.16886365413665771, + "learning_rate": 1.626803270703936e-06, + "loss": 0.14266083240509034, + "step": 4885 + }, + { + "epoch": 0.8897379912663755, + "grad_norm": 0.1891999989748001, + "learning_rate": 1.6007856908637652e-06, + "loss": 0.1398016929626465, + "step": 4890 + }, + { + "epoch": 0.8906477438136827, + "grad_norm": 0.17645299434661865, + "learning_rate": 1.5749709595550083e-06, + "loss": 0.13869571685791016, + "step": 4895 + }, + { + "epoch": 0.8915574963609898, + "grad_norm": 0.17714262008666992, + "learning_rate": 1.549359300566408e-06, + "loss": 0.14957486391067504, + "step": 4900 + }, + { + "epoch": 0.892467248908297, + "grad_norm": 0.18025240302085876, + "learning_rate": 1.5239509359262355e-06, + "loss": 0.1358652949333191, + "step": 4905 + }, + { + "epoch": 0.8933770014556041, + "grad_norm": 0.17539937794208527, + "learning_rate": 1.4987460859004154e-06, + "loss": 0.13833394050598144, + "step": 4910 + }, + { + "epoch": 0.8942867540029112, + "grad_norm": 0.1772230565547943, + "learning_rate": 1.4737449689905953e-06, + "loss": 0.14202116727828978, + "step": 4915 + }, + { + "epoch": 0.8951965065502183, + "grad_norm": 0.1670161783695221, + "learning_rate": 1.4489478019322433e-06, + "loss": 0.1403665542602539, + "step": 4920 + }, + { + "epoch": 0.8961062590975255, + "grad_norm": 0.1697034239768982, + "learning_rate": 1.4243547996927926e-06, + "loss": 0.1401481032371521, + "step": 4925 + }, + { + "epoch": 0.8970160116448326, + "grad_norm": 0.16474860906600952, + "learning_rate": 1.3999661754697636e-06, + "loss": 0.13969850540161133, + "step": 4930 + }, + { + "epoch": 0.8979257641921398, + "grad_norm": 0.1664883941411972, + "learning_rate": 1.3757821406889027e-06, + "loss": 0.1399069309234619, + "step": 4935 + }, + { + "epoch": 0.8988355167394468, + "grad_norm": 0.16675794124603271, + "learning_rate": 1.351802905002386e-06, + "loss": 0.14129226207733153, + "step": 4940 + }, + { + "epoch": 0.899745269286754, + "grad_norm": 0.17529809474945068, + "learning_rate": 1.3280286762869632e-06, + "loss": 0.14663081169128417, + "step": 4945 + }, + { + "epoch": 0.9006550218340611, + "grad_norm": 0.17758169770240784, + "learning_rate": 1.3044596606421795e-06, + "loss": 0.13986254930496217, + "step": 4950 + }, + { + "epoch": 0.9015647743813683, + "grad_norm": 0.153225839138031, + "learning_rate": 1.2810960623885815e-06, + "loss": 0.14236698150634766, + "step": 4955 + }, + { + "epoch": 0.9024745269286754, + "grad_norm": 0.169761523604393, + "learning_rate": 1.2579380840659376e-06, + "loss": 0.1450445055961609, + "step": 4960 + }, + { + "epoch": 0.9033842794759825, + "grad_norm": 0.16659331321716309, + "learning_rate": 1.2349859264315034e-06, + "loss": 0.14043926000595092, + "step": 4965 + }, + { + "epoch": 0.9042940320232896, + "grad_norm": 0.16748706996440887, + "learning_rate": 1.2122397884582553e-06, + "loss": 0.14725675582885742, + "step": 4970 + }, + { + "epoch": 0.9052037845705968, + "grad_norm": 0.1600511223077774, + "learning_rate": 1.1896998673331883e-06, + "loss": 0.14551150798797607, + "step": 4975 + }, + { + "epoch": 0.9061135371179039, + "grad_norm": 0.24318362772464752, + "learning_rate": 1.1673663584555934e-06, + "loss": 0.14470888376235963, + "step": 4980 + }, + { + "epoch": 0.9070232896652111, + "grad_norm": 0.16443821787834167, + "learning_rate": 1.1452394554353706e-06, + "loss": 0.13639854192733764, + "step": 4985 + }, + { + "epoch": 0.9079330422125182, + "grad_norm": 0.14277774095535278, + "learning_rate": 1.1233193500913453e-06, + "loss": 0.13749881982803344, + "step": 4990 + }, + { + "epoch": 0.9088427947598253, + "grad_norm": 0.1610947549343109, + "learning_rate": 1.1016062324496008e-06, + "loss": 0.1385629653930664, + "step": 4995 + }, + { + "epoch": 0.9097525473071325, + "grad_norm": 0.17888498306274414, + "learning_rate": 1.080100290741845e-06, + "loss": 0.14225621223449708, + "step": 5000 + }, + { + "epoch": 0.9106622998544396, + "grad_norm": 0.17488449811935425, + "learning_rate": 1.0588017114037729e-06, + "loss": 0.14187805652618407, + "step": 5005 + }, + { + "epoch": 0.9115720524017468, + "grad_norm": 0.16410665214061737, + "learning_rate": 1.0377106790734392e-06, + "loss": 0.1407416582107544, + "step": 5010 + }, + { + "epoch": 0.9124818049490538, + "grad_norm": 0.18115971982479095, + "learning_rate": 1.016827376589674e-06, + "loss": 0.1427263855934143, + "step": 5015 + }, + { + "epoch": 0.913391557496361, + "grad_norm": 0.18507841229438782, + "learning_rate": 9.961519849904898e-07, + "loss": 0.1390499472618103, + "step": 5020 + }, + { + "epoch": 0.9143013100436681, + "grad_norm": 0.21296796202659607, + "learning_rate": 9.75684683511513e-07, + "loss": 0.1382216691970825, + "step": 5025 + }, + { + "epoch": 0.9152110625909753, + "grad_norm": 0.2308044582605362, + "learning_rate": 9.55425649584435e-07, + "loss": 0.14271280765533448, + "step": 5030 + }, + { + "epoch": 0.9161208151382824, + "grad_norm": 0.15796682238578796, + "learning_rate": 9.353750588354527e-07, + "loss": 0.13807624578475952, + "step": 5035 + }, + { + "epoch": 0.9170305676855895, + "grad_norm": 0.1695316582918167, + "learning_rate": 9.155330850837834e-07, + "loss": 0.14289476871490478, + "step": 5040 + }, + { + "epoch": 0.9179403202328966, + "grad_norm": 0.1738404780626297, + "learning_rate": 8.958999003401191e-07, + "loss": 0.14070619344711305, + "step": 5045 + }, + { + "epoch": 0.9188500727802038, + "grad_norm": 0.20618964731693268, + "learning_rate": 8.764756748051662e-07, + "loss": 0.14535053968429565, + "step": 5050 + }, + { + "epoch": 0.9197598253275109, + "grad_norm": 0.1506137251853943, + "learning_rate": 8.572605768681546e-07, + "loss": 0.13995139598846434, + "step": 5055 + }, + { + "epoch": 0.9206695778748181, + "grad_norm": 0.17772039771080017, + "learning_rate": 8.382547731053708e-07, + "loss": 0.14470311403274536, + "step": 5060 + }, + { + "epoch": 0.9215793304221251, + "grad_norm": 0.19897456467151642, + "learning_rate": 8.194584282787382e-07, + "loss": 0.144488525390625, + "step": 5065 + }, + { + "epoch": 0.9224890829694323, + "grad_norm": 0.15899236500263214, + "learning_rate": 8.008717053343606e-07, + "loss": 0.1352991580963135, + "step": 5070 + }, + { + "epoch": 0.9233988355167394, + "grad_norm": 0.14965768158435822, + "learning_rate": 7.824947654011345e-07, + "loss": 0.13827911615371705, + "step": 5075 + }, + { + "epoch": 0.9243085880640466, + "grad_norm": 0.43651485443115234, + "learning_rate": 7.643277677893329e-07, + "loss": 0.14149526357650757, + "step": 5080 + }, + { + "epoch": 0.9252183406113537, + "grad_norm": 0.19912713766098022, + "learning_rate": 7.463708699892325e-07, + "loss": 0.14357032775878906, + "step": 5085 + }, + { + "epoch": 0.9261280931586608, + "grad_norm": 0.1635904610157013, + "learning_rate": 7.286242276697524e-07, + "loss": 0.13550699949264527, + "step": 5090 + }, + { + "epoch": 0.9270378457059679, + "grad_norm": 0.19391080737113953, + "learning_rate": 7.11087994677101e-07, + "loss": 0.14674756526947022, + "step": 5095 + }, + { + "epoch": 0.9279475982532751, + "grad_norm": 0.17458125948905945, + "learning_rate": 6.937623230334284e-07, + "loss": 0.14155579805374147, + "step": 5100 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.80383382252852e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5100/training_args.bin b/checkpoint-5100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-5100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-5200/README.md b/checkpoint-5200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-5200/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-5200/adapter_config.json b/checkpoint-5200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-5200/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5200/adapter_model.safetensors b/checkpoint-5200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0a8d989d01ea447e029b7061f74fd9b090841cb4 --- /dev/null +++ b/checkpoint-5200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4132af238f49ee03df7f66e26b8f70f7512186b9a13bfe82a31c1aed04cd19 +size 169741912 diff --git a/checkpoint-5200/chat_template.jinja b/checkpoint-5200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-5200/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-5200/optimizer.pt b/checkpoint-5200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..939f4536c13053ee071dab2632e2741ebcc25c50 --- /dev/null +++ b/checkpoint-5200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7172af83fe1eb61895afe57e7fa607cb3b9c14fffa96ff4215f95b993b3cf93d +size 72807355 diff --git a/checkpoint-5200/processor_config.json b/checkpoint-5200/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-5200/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-5200/rng_state.pth b/checkpoint-5200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-5200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-5200/scheduler.pt b/checkpoint-5200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bdaf72ea9881a24b87c7c0436cdc9643e84d0692 --- /dev/null +++ b/checkpoint-5200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a5ffec2bd06b88fab5cacdc3271508d4c91b0c41f2c3d99454223ab1ca20a73 +size 1465 diff --git a/checkpoint-5200/tokenizer.json b/checkpoint-5200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-5200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-5200/tokenizer_config.json b/checkpoint-5200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-5200/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-5200/trainer_state.json b/checkpoint-5200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..91a801a13367f0bd58b4800a43761fda3910d13c --- /dev/null +++ b/checkpoint-5200/trainer_state.json @@ -0,0 +1,7322 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9461426491994177, + "eval_steps": 100, + "global_step": 5200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + }, + { + "epoch": 0.8196870451237264, + "grad_norm": 0.1393742561340332, + "learning_rate": 4.178055288745053e-06, + "loss": 0.13769235610961914, + "step": 4505 + }, + { + "epoch": 0.8205967976710334, + "grad_norm": 0.1668994128704071, + "learning_rate": 4.137406700206617e-06, + "loss": 0.14029752016067504, + "step": 4510 + }, + { + "epoch": 0.8215065502183406, + "grad_norm": 0.1833454668521881, + "learning_rate": 4.0969389701515675e-06, + "loss": 0.14276301860809326, + "step": 4515 + }, + { + "epoch": 0.8224163027656477, + "grad_norm": 0.16187874972820282, + "learning_rate": 4.056652449395945e-06, + "loss": 0.1444832682609558, + "step": 4520 + }, + { + "epoch": 0.8233260553129549, + "grad_norm": 0.1453280746936798, + "learning_rate": 4.01654748718488e-06, + "loss": 0.14512733221054078, + "step": 4525 + }, + { + "epoch": 0.824235807860262, + "grad_norm": 0.1782725751399994, + "learning_rate": 3.976624431189563e-06, + "loss": 0.14093561172485353, + "step": 4530 + }, + { + "epoch": 0.8251455604075691, + "grad_norm": 0.17374491691589355, + "learning_rate": 3.936883627504234e-06, + "loss": 0.14031401872634888, + "step": 4535 + }, + { + "epoch": 0.8260553129548762, + "grad_norm": 0.1609172821044922, + "learning_rate": 3.897325420643174e-06, + "loss": 0.1428336262702942, + "step": 4540 + }, + { + "epoch": 0.8269650655021834, + "grad_norm": 0.1520884931087494, + "learning_rate": 3.85795015353774e-06, + "loss": 0.1460547924041748, + "step": 4545 + }, + { + "epoch": 0.8278748180494906, + "grad_norm": 0.20986326038837433, + "learning_rate": 3.818758167533376e-06, + "loss": 0.14706350564956666, + "step": 4550 + }, + { + "epoch": 0.8287845705967977, + "grad_norm": 0.16825413703918457, + "learning_rate": 3.7797498023866396e-06, + "loss": 0.14507200717926025, + "step": 4555 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.16758380830287933, + "learning_rate": 3.740925396262296e-06, + "loss": 0.14898381233215333, + "step": 4560 + }, + { + "epoch": 0.8306040756914119, + "grad_norm": 0.15207453072071075, + "learning_rate": 3.7022852857303503e-06, + "loss": 0.14138854742050172, + "step": 4565 + }, + { + "epoch": 0.8315138282387191, + "grad_norm": 0.15150749683380127, + "learning_rate": 3.66382980576315e-06, + "loss": 0.13894975185394287, + "step": 4570 + }, + { + "epoch": 0.8324235807860262, + "grad_norm": 0.17071188986301422, + "learning_rate": 3.625559289732472e-06, + "loss": 0.14072470664978026, + "step": 4575 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.154335618019104, + "learning_rate": 3.5874740694066294e-06, + "loss": 0.13791344165802003, + "step": 4580 + }, + { + "epoch": 0.8342430858806404, + "grad_norm": 0.14017128944396973, + "learning_rate": 3.5495744749476116e-06, + "loss": 0.14427922964096068, + "step": 4585 + }, + { + "epoch": 0.8351528384279476, + "grad_norm": 0.17210033535957336, + "learning_rate": 3.5118608349081983e-06, + "loss": 0.15191166400909423, + "step": 4590 + }, + { + "epoch": 0.8360625909752547, + "grad_norm": 0.18715685606002808, + "learning_rate": 3.4743334762291358e-06, + "loss": 0.14451316595077515, + "step": 4595 + }, + { + "epoch": 0.8369723435225619, + "grad_norm": 0.18079884350299835, + "learning_rate": 3.436992724236293e-06, + "loss": 0.13530746698379517, + "step": 4600 + }, + { + "epoch": 0.837882096069869, + "grad_norm": 0.13519920408725739, + "learning_rate": 3.399838902637817e-06, + "loss": 0.1477964401245117, + "step": 4605 + }, + { + "epoch": 0.8387918486171762, + "grad_norm": 0.1778026670217514, + "learning_rate": 3.3628723335213885e-06, + "loss": 0.14419831037521363, + "step": 4610 + }, + { + "epoch": 0.8397016011644832, + "grad_norm": 0.15165366232395172, + "learning_rate": 3.326093337351355e-06, + "loss": 0.13888469934463502, + "step": 4615 + }, + { + "epoch": 0.8406113537117904, + "grad_norm": 0.17049473524093628, + "learning_rate": 3.2895022329660018e-06, + "loss": 0.14438477754592896, + "step": 4620 + }, + { + "epoch": 0.8415211062590975, + "grad_norm": 0.16536414623260498, + "learning_rate": 3.2530993375747833e-06, + "loss": 0.1444351315498352, + "step": 4625 + }, + { + "epoch": 0.8424308588064047, + "grad_norm": 0.17570015788078308, + "learning_rate": 3.2168849667555402e-06, + "loss": 0.13861945867538453, + "step": 4630 + }, + { + "epoch": 0.8433406113537117, + "grad_norm": 0.1699545532464981, + "learning_rate": 3.1808594344518132e-06, + "loss": 0.13902754783630372, + "step": 4635 + }, + { + "epoch": 0.8442503639010189, + "grad_norm": 0.12331254780292511, + "learning_rate": 3.1450230529700837e-06, + "loss": 0.14104254245758058, + "step": 4640 + }, + { + "epoch": 0.845160116448326, + "grad_norm": 0.1508190929889679, + "learning_rate": 3.1093761329770708e-06, + "loss": 0.13288766145706177, + "step": 4645 + }, + { + "epoch": 0.8460698689956332, + "grad_norm": 0.19049489498138428, + "learning_rate": 3.0739189834970735e-06, + "loss": 0.14914840459823608, + "step": 4650 + }, + { + "epoch": 0.8469796215429404, + "grad_norm": 0.1662369966506958, + "learning_rate": 3.0386519119092293e-06, + "loss": 0.14222898483276367, + "step": 4655 + }, + { + "epoch": 0.8478893740902474, + "grad_norm": 0.18985967338085175, + "learning_rate": 3.0035752239449126e-06, + "loss": 0.14431113004684448, + "step": 4660 + }, + { + "epoch": 0.8487991266375546, + "grad_norm": 0.17005261778831482, + "learning_rate": 2.9686892236850337e-06, + "loss": 0.14140807390213012, + "step": 4665 + }, + { + "epoch": 0.8497088791848617, + "grad_norm": 0.16786684095859528, + "learning_rate": 2.9339942135574394e-06, + "loss": 0.14161460399627684, + "step": 4670 + }, + { + "epoch": 0.8506186317321689, + "grad_norm": 0.16358181834220886, + "learning_rate": 2.899490494334281e-06, + "loss": 0.14674670696258546, + "step": 4675 + }, + { + "epoch": 0.851528384279476, + "grad_norm": 0.1651349812746048, + "learning_rate": 2.8651783651293867e-06, + "loss": 0.13794611692428588, + "step": 4680 + }, + { + "epoch": 0.8524381368267832, + "grad_norm": 0.16934923827648163, + "learning_rate": 2.831058123395694e-06, + "loss": 0.13199397325515747, + "step": 4685 + }, + { + "epoch": 0.8533478893740902, + "grad_norm": 0.1704150140285492, + "learning_rate": 2.797130064922665e-06, + "loss": 0.14044904708862305, + "step": 4690 + }, + { + "epoch": 0.8542576419213974, + "grad_norm": 0.1814192682504654, + "learning_rate": 2.7633944838337143e-06, + "loss": 0.1465100646018982, + "step": 4695 + }, + { + "epoch": 0.8551673944687045, + "grad_norm": 0.18942610919475555, + "learning_rate": 2.729851672583669e-06, + "loss": 0.14685982465744019, + "step": 4700 + }, + { + "epoch": 0.8560771470160117, + "grad_norm": 0.17895208299160004, + "learning_rate": 2.6965019219562155e-06, + "loss": 0.13971571922302245, + "step": 4705 + }, + { + "epoch": 0.8569868995633187, + "grad_norm": 0.22735828161239624, + "learning_rate": 2.6633455210614055e-06, + "loss": 0.13776102066040039, + "step": 4710 + }, + { + "epoch": 0.8578966521106259, + "grad_norm": 0.16779793798923492, + "learning_rate": 2.630382757333133e-06, + "loss": 0.14134042263031005, + "step": 4715 + }, + { + "epoch": 0.858806404657933, + "grad_norm": 0.2148888260126114, + "learning_rate": 2.597613916526637e-06, + "loss": 0.14680721759796142, + "step": 4720 + }, + { + "epoch": 0.8597161572052402, + "grad_norm": 0.16560257971286774, + "learning_rate": 2.565039282716045e-06, + "loss": 0.14137234687805175, + "step": 4725 + }, + { + "epoch": 0.8606259097525473, + "grad_norm": 0.16197068989276886, + "learning_rate": 2.532659138291879e-06, + "loss": 0.14969314336776735, + "step": 4730 + }, + { + "epoch": 0.8615356622998545, + "grad_norm": 0.14650246500968933, + "learning_rate": 2.5004737639586497e-06, + "loss": 0.13532910346984864, + "step": 4735 + }, + { + "epoch": 0.8624454148471615, + "grad_norm": 0.1565634310245514, + "learning_rate": 2.4684834387323943e-06, + "loss": 0.14146244525909424, + "step": 4740 + }, + { + "epoch": 0.8633551673944687, + "grad_norm": 0.18060864508152008, + "learning_rate": 2.4366884399382393e-06, + "loss": 0.14218534231185914, + "step": 4745 + }, + { + "epoch": 0.8642649199417758, + "grad_norm": 0.24613255262374878, + "learning_rate": 2.4050890432080557e-06, + "loss": 0.13907679319381713, + "step": 4750 + }, + { + "epoch": 0.865174672489083, + "grad_norm": 0.16036023199558258, + "learning_rate": 2.3736855224780057e-06, + "loss": 0.13718113899230958, + "step": 4755 + }, + { + "epoch": 0.86608442503639, + "grad_norm": 0.16678516566753387, + "learning_rate": 2.3424781499862075e-06, + "loss": 0.1327962040901184, + "step": 4760 + }, + { + "epoch": 0.8669941775836972, + "grad_norm": 0.1763770878314972, + "learning_rate": 2.3114671962703727e-06, + "loss": 0.14390318393707274, + "step": 4765 + }, + { + "epoch": 0.8679039301310044, + "grad_norm": 0.17735697329044342, + "learning_rate": 2.280652930165428e-06, + "loss": 0.15223288536071777, + "step": 4770 + }, + { + "epoch": 0.8688136826783115, + "grad_norm": 0.15827041864395142, + "learning_rate": 2.250035618801241e-06, + "loss": 0.14296332597732545, + "step": 4775 + }, + { + "epoch": 0.8697234352256187, + "grad_norm": 0.16876135766506195, + "learning_rate": 2.219615527600244e-06, + "loss": 0.1359076738357544, + "step": 4780 + }, + { + "epoch": 0.8706331877729258, + "grad_norm": 0.1800110638141632, + "learning_rate": 2.189392920275174e-06, + "loss": 0.1424281358718872, + "step": 4785 + }, + { + "epoch": 0.8715429403202329, + "grad_norm": 0.1409560889005661, + "learning_rate": 2.159368058826783e-06, + "loss": 0.14480490684509278, + "step": 4790 + }, + { + "epoch": 0.87245269286754, + "grad_norm": 0.1634288728237152, + "learning_rate": 2.129541203541535e-06, + "loss": 0.14513269662857056, + "step": 4795 + }, + { + "epoch": 0.8733624454148472, + "grad_norm": 0.17126062512397766, + "learning_rate": 2.099912612989391e-06, + "loss": 0.13546934127807617, + "step": 4800 + }, + { + "epoch": 0.8742721979621543, + "grad_norm": 0.16704080998897552, + "learning_rate": 2.0704825440215457e-06, + "loss": 0.13852492570877076, + "step": 4805 + }, + { + "epoch": 0.8751819505094615, + "grad_norm": 0.1725970208644867, + "learning_rate": 2.0412512517681946e-06, + "loss": 0.14504197835922242, + "step": 4810 + }, + { + "epoch": 0.8760917030567685, + "grad_norm": 0.1700201779603958, + "learning_rate": 2.0122189896363387e-06, + "loss": 0.14312338829040527, + "step": 4815 + }, + { + "epoch": 0.8770014556040757, + "grad_norm": 0.16491736471652985, + "learning_rate": 1.9833860093075834e-06, + "loss": 0.14062976837158203, + "step": 4820 + }, + { + "epoch": 0.8779112081513828, + "grad_norm": 0.13748787343502045, + "learning_rate": 1.9547525607359537e-06, + "loss": 0.1346171498298645, + "step": 4825 + }, + { + "epoch": 0.87882096069869, + "grad_norm": 0.16399399936199188, + "learning_rate": 1.926318892145712e-06, + "loss": 0.14178123474121093, + "step": 4830 + }, + { + "epoch": 0.879730713245997, + "grad_norm": 0.14491963386535645, + "learning_rate": 1.8980852500292412e-06, + "loss": 0.1408564567565918, + "step": 4835 + }, + { + "epoch": 0.8806404657933042, + "grad_norm": 0.17335423827171326, + "learning_rate": 1.8700518791448851e-06, + "loss": 0.14403265714645386, + "step": 4840 + }, + { + "epoch": 0.8815502183406113, + "grad_norm": 0.17399625480175018, + "learning_rate": 1.8422190225148155e-06, + "loss": 0.14289036989212037, + "step": 4845 + }, + { + "epoch": 0.8824599708879185, + "grad_norm": 0.17945612967014313, + "learning_rate": 1.814586921422956e-06, + "loss": 0.14494109153747559, + "step": 4850 + }, + { + "epoch": 0.8833697234352256, + "grad_norm": 0.1910620480775833, + "learning_rate": 1.7871558154128664e-06, + "loss": 0.13726245164871215, + "step": 4855 + }, + { + "epoch": 0.8842794759825328, + "grad_norm": 0.1771879345178604, + "learning_rate": 1.7599259422856756e-06, + "loss": 0.1464752197265625, + "step": 4860 + }, + { + "epoch": 0.8851892285298398, + "grad_norm": 0.19427461922168732, + "learning_rate": 1.7328975380980218e-06, + "loss": 0.13823356628417968, + "step": 4865 + }, + { + "epoch": 0.886098981077147, + "grad_norm": 0.1491149365901947, + "learning_rate": 1.7060708371599897e-06, + "loss": 0.1338604211807251, + "step": 4870 + }, + { + "epoch": 0.8870087336244541, + "grad_norm": 0.16087733209133148, + "learning_rate": 1.6794460720331057e-06, + "loss": 0.14184389114379883, + "step": 4875 + }, + { + "epoch": 0.8879184861717613, + "grad_norm": 0.14506325125694275, + "learning_rate": 1.653023473528309e-06, + "loss": 0.14267687797546386, + "step": 4880 + }, + { + "epoch": 0.8888282387190685, + "grad_norm": 0.16886365413665771, + "learning_rate": 1.626803270703936e-06, + "loss": 0.14266083240509034, + "step": 4885 + }, + { + "epoch": 0.8897379912663755, + "grad_norm": 0.1891999989748001, + "learning_rate": 1.6007856908637652e-06, + "loss": 0.1398016929626465, + "step": 4890 + }, + { + "epoch": 0.8906477438136827, + "grad_norm": 0.17645299434661865, + "learning_rate": 1.5749709595550083e-06, + "loss": 0.13869571685791016, + "step": 4895 + }, + { + "epoch": 0.8915574963609898, + "grad_norm": 0.17714262008666992, + "learning_rate": 1.549359300566408e-06, + "loss": 0.14957486391067504, + "step": 4900 + }, + { + "epoch": 0.892467248908297, + "grad_norm": 0.18025240302085876, + "learning_rate": 1.5239509359262355e-06, + "loss": 0.1358652949333191, + "step": 4905 + }, + { + "epoch": 0.8933770014556041, + "grad_norm": 0.17539937794208527, + "learning_rate": 1.4987460859004154e-06, + "loss": 0.13833394050598144, + "step": 4910 + }, + { + "epoch": 0.8942867540029112, + "grad_norm": 0.1772230565547943, + "learning_rate": 1.4737449689905953e-06, + "loss": 0.14202116727828978, + "step": 4915 + }, + { + "epoch": 0.8951965065502183, + "grad_norm": 0.1670161783695221, + "learning_rate": 1.4489478019322433e-06, + "loss": 0.1403665542602539, + "step": 4920 + }, + { + "epoch": 0.8961062590975255, + "grad_norm": 0.1697034239768982, + "learning_rate": 1.4243547996927926e-06, + "loss": 0.1401481032371521, + "step": 4925 + }, + { + "epoch": 0.8970160116448326, + "grad_norm": 0.16474860906600952, + "learning_rate": 1.3999661754697636e-06, + "loss": 0.13969850540161133, + "step": 4930 + }, + { + "epoch": 0.8979257641921398, + "grad_norm": 0.1664883941411972, + "learning_rate": 1.3757821406889027e-06, + "loss": 0.1399069309234619, + "step": 4935 + }, + { + "epoch": 0.8988355167394468, + "grad_norm": 0.16675794124603271, + "learning_rate": 1.351802905002386e-06, + "loss": 0.14129226207733153, + "step": 4940 + }, + { + "epoch": 0.899745269286754, + "grad_norm": 0.17529809474945068, + "learning_rate": 1.3280286762869632e-06, + "loss": 0.14663081169128417, + "step": 4945 + }, + { + "epoch": 0.9006550218340611, + "grad_norm": 0.17758169770240784, + "learning_rate": 1.3044596606421795e-06, + "loss": 0.13986254930496217, + "step": 4950 + }, + { + "epoch": 0.9015647743813683, + "grad_norm": 0.153225839138031, + "learning_rate": 1.2810960623885815e-06, + "loss": 0.14236698150634766, + "step": 4955 + }, + { + "epoch": 0.9024745269286754, + "grad_norm": 0.169761523604393, + "learning_rate": 1.2579380840659376e-06, + "loss": 0.1450445055961609, + "step": 4960 + }, + { + "epoch": 0.9033842794759825, + "grad_norm": 0.16659331321716309, + "learning_rate": 1.2349859264315034e-06, + "loss": 0.14043926000595092, + "step": 4965 + }, + { + "epoch": 0.9042940320232896, + "grad_norm": 0.16748706996440887, + "learning_rate": 1.2122397884582553e-06, + "loss": 0.14725675582885742, + "step": 4970 + }, + { + "epoch": 0.9052037845705968, + "grad_norm": 0.1600511223077774, + "learning_rate": 1.1896998673331883e-06, + "loss": 0.14551150798797607, + "step": 4975 + }, + { + "epoch": 0.9061135371179039, + "grad_norm": 0.24318362772464752, + "learning_rate": 1.1673663584555934e-06, + "loss": 0.14470888376235963, + "step": 4980 + }, + { + "epoch": 0.9070232896652111, + "grad_norm": 0.16443821787834167, + "learning_rate": 1.1452394554353706e-06, + "loss": 0.13639854192733764, + "step": 4985 + }, + { + "epoch": 0.9079330422125182, + "grad_norm": 0.14277774095535278, + "learning_rate": 1.1233193500913453e-06, + "loss": 0.13749881982803344, + "step": 4990 + }, + { + "epoch": 0.9088427947598253, + "grad_norm": 0.1610947549343109, + "learning_rate": 1.1016062324496008e-06, + "loss": 0.1385629653930664, + "step": 4995 + }, + { + "epoch": 0.9097525473071325, + "grad_norm": 0.17888498306274414, + "learning_rate": 1.080100290741845e-06, + "loss": 0.14225621223449708, + "step": 5000 + }, + { + "epoch": 0.9106622998544396, + "grad_norm": 0.17488449811935425, + "learning_rate": 1.0588017114037729e-06, + "loss": 0.14187805652618407, + "step": 5005 + }, + { + "epoch": 0.9115720524017468, + "grad_norm": 0.16410665214061737, + "learning_rate": 1.0377106790734392e-06, + "loss": 0.1407416582107544, + "step": 5010 + }, + { + "epoch": 0.9124818049490538, + "grad_norm": 0.18115971982479095, + "learning_rate": 1.016827376589674e-06, + "loss": 0.1427263855934143, + "step": 5015 + }, + { + "epoch": 0.913391557496361, + "grad_norm": 0.18507841229438782, + "learning_rate": 9.961519849904898e-07, + "loss": 0.1390499472618103, + "step": 5020 + }, + { + "epoch": 0.9143013100436681, + "grad_norm": 0.21296796202659607, + "learning_rate": 9.75684683511513e-07, + "loss": 0.1382216691970825, + "step": 5025 + }, + { + "epoch": 0.9152110625909753, + "grad_norm": 0.2308044582605362, + "learning_rate": 9.55425649584435e-07, + "loss": 0.14271280765533448, + "step": 5030 + }, + { + "epoch": 0.9161208151382824, + "grad_norm": 0.15796682238578796, + "learning_rate": 9.353750588354527e-07, + "loss": 0.13807624578475952, + "step": 5035 + }, + { + "epoch": 0.9170305676855895, + "grad_norm": 0.1695316582918167, + "learning_rate": 9.155330850837834e-07, + "loss": 0.14289476871490478, + "step": 5040 + }, + { + "epoch": 0.9179403202328966, + "grad_norm": 0.1738404780626297, + "learning_rate": 8.958999003401191e-07, + "loss": 0.14070619344711305, + "step": 5045 + }, + { + "epoch": 0.9188500727802038, + "grad_norm": 0.20618964731693268, + "learning_rate": 8.764756748051662e-07, + "loss": 0.14535053968429565, + "step": 5050 + }, + { + "epoch": 0.9197598253275109, + "grad_norm": 0.1506137251853943, + "learning_rate": 8.572605768681546e-07, + "loss": 0.13995139598846434, + "step": 5055 + }, + { + "epoch": 0.9206695778748181, + "grad_norm": 0.17772039771080017, + "learning_rate": 8.382547731053708e-07, + "loss": 0.14470311403274536, + "step": 5060 + }, + { + "epoch": 0.9215793304221251, + "grad_norm": 0.19897456467151642, + "learning_rate": 8.194584282787382e-07, + "loss": 0.144488525390625, + "step": 5065 + }, + { + "epoch": 0.9224890829694323, + "grad_norm": 0.15899236500263214, + "learning_rate": 8.008717053343606e-07, + "loss": 0.1352991580963135, + "step": 5070 + }, + { + "epoch": 0.9233988355167394, + "grad_norm": 0.14965768158435822, + "learning_rate": 7.824947654011345e-07, + "loss": 0.13827911615371705, + "step": 5075 + }, + { + "epoch": 0.9243085880640466, + "grad_norm": 0.43651485443115234, + "learning_rate": 7.643277677893329e-07, + "loss": 0.14149526357650757, + "step": 5080 + }, + { + "epoch": 0.9252183406113537, + "grad_norm": 0.19912713766098022, + "learning_rate": 7.463708699892325e-07, + "loss": 0.14357032775878906, + "step": 5085 + }, + { + "epoch": 0.9261280931586608, + "grad_norm": 0.1635904610157013, + "learning_rate": 7.286242276697524e-07, + "loss": 0.13550699949264527, + "step": 5090 + }, + { + "epoch": 0.9270378457059679, + "grad_norm": 0.19391080737113953, + "learning_rate": 7.11087994677101e-07, + "loss": 0.14674756526947022, + "step": 5095 + }, + { + "epoch": 0.9279475982532751, + "grad_norm": 0.17458125948905945, + "learning_rate": 6.937623230334284e-07, + "loss": 0.14155579805374147, + "step": 5100 + }, + { + "epoch": 0.9288573508005823, + "grad_norm": 0.1617971807718277, + "learning_rate": 6.766473629355452e-07, + "loss": 0.140555477142334, + "step": 5105 + }, + { + "epoch": 0.9297671033478894, + "grad_norm": 0.16945427656173706, + "learning_rate": 6.59743262753576e-07, + "loss": 0.13607511520385743, + "step": 5110 + }, + { + "epoch": 0.9306768558951966, + "grad_norm": 0.18347840011119843, + "learning_rate": 6.43050169029702e-07, + "loss": 0.14903461933135986, + "step": 5115 + }, + { + "epoch": 0.9315866084425036, + "grad_norm": 0.15434837341308594, + "learning_rate": 6.265682264768869e-07, + "loss": 0.14146015644073487, + "step": 5120 + }, + { + "epoch": 0.9324963609898108, + "grad_norm": 0.1397712528705597, + "learning_rate": 6.10297577977606e-07, + "loss": 0.14261592626571656, + "step": 5125 + }, + { + "epoch": 0.9334061135371179, + "grad_norm": 0.1765873283147812, + "learning_rate": 5.942383645826361e-07, + "loss": 0.13559447526931762, + "step": 5130 + }, + { + "epoch": 0.9343158660844251, + "grad_norm": 0.1656057983636856, + "learning_rate": 5.783907255098003e-07, + "loss": 0.13961490392684936, + "step": 5135 + }, + { + "epoch": 0.9352256186317321, + "grad_norm": 0.2169366180896759, + "learning_rate": 5.627547981427894e-07, + "loss": 0.1447835922241211, + "step": 5140 + }, + { + "epoch": 0.9361353711790393, + "grad_norm": 0.18623125553131104, + "learning_rate": 5.473307180299508e-07, + "loss": 0.14366730451583862, + "step": 5145 + }, + { + "epoch": 0.9370451237263464, + "grad_norm": 0.15423963963985443, + "learning_rate": 5.32118618883129e-07, + "loss": 0.14295632839202882, + "step": 5150 + }, + { + "epoch": 0.9379548762736536, + "grad_norm": 0.18423247337341309, + "learning_rate": 5.17118632576491e-07, + "loss": 0.14137414693832398, + "step": 5155 + }, + { + "epoch": 0.9388646288209607, + "grad_norm": 0.15338757634162903, + "learning_rate": 5.023308891453915e-07, + "loss": 0.13583066463470458, + "step": 5160 + }, + { + "epoch": 0.9397743813682679, + "grad_norm": 0.2293633222579956, + "learning_rate": 4.877555167852515e-07, + "loss": 0.14819620847702025, + "step": 5165 + }, + { + "epoch": 0.9406841339155749, + "grad_norm": 0.16889944672584534, + "learning_rate": 4.7339264185043974e-07, + "loss": 0.13617686033248902, + "step": 5170 + }, + { + "epoch": 0.9415938864628821, + "grad_norm": 0.1767464578151703, + "learning_rate": 4.5924238885316775e-07, + "loss": 0.13487552404403685, + "step": 5175 + }, + { + "epoch": 0.9425036390101892, + "grad_norm": 0.16697899997234344, + "learning_rate": 4.453048804624327e-07, + "loss": 0.1446886420249939, + "step": 5180 + }, + { + "epoch": 0.9434133915574964, + "grad_norm": 0.19576266407966614, + "learning_rate": 4.315802375029293e-07, + "loss": 0.14252450466156005, + "step": 5185 + }, + { + "epoch": 0.9443231441048034, + "grad_norm": 0.14838077127933502, + "learning_rate": 4.18068578954034e-07, + "loss": 0.13933032751083374, + "step": 5190 + }, + { + "epoch": 0.9452328966521106, + "grad_norm": 0.18481744825839996, + "learning_rate": 4.047700219487388e-07, + "loss": 0.1410665273666382, + "step": 5195 + }, + { + "epoch": 0.9461426491994177, + "grad_norm": 0.16954176127910614, + "learning_rate": 3.9168468177265547e-07, + "loss": 0.1421758770942688, + "step": 5200 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.857910203994113e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5200/training_args.bin b/checkpoint-5200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-5200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-5300/README.md b/checkpoint-5300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-5300/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-5300/adapter_config.json b/checkpoint-5300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-5300/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5300/adapter_model.safetensors b/checkpoint-5300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fefa75a7bec8a4d2fb4a21708e4fcf1074bfedbe --- /dev/null +++ b/checkpoint-5300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03d134e5bd0379593af8640a9695296c13be790cd99b98f86152deb3956e3d52 +size 169741912 diff --git a/checkpoint-5300/chat_template.jinja b/checkpoint-5300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-5300/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-5300/optimizer.pt b/checkpoint-5300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..652eb0c69fcac1c906678deeee2e36cb5ccf5b1f --- /dev/null +++ b/checkpoint-5300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ab24f6e8be492fde0cbb674956956b52430e2856ceb6e24d1dccd0203a0255a +size 72807355 diff --git a/checkpoint-5300/processor_config.json b/checkpoint-5300/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-5300/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-5300/rng_state.pth b/checkpoint-5300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-5300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-5300/scheduler.pt b/checkpoint-5300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cefb749dd36fecbf51d2bd3f5e120fbe964a5179 --- /dev/null +++ b/checkpoint-5300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e15f40cb3aebdb433d431e37533a02b8207ffc5eeb3fa5f208e35510462495e +size 1465 diff --git a/checkpoint-5300/tokenizer.json b/checkpoint-5300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-5300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-5300/tokenizer_config.json b/checkpoint-5300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-5300/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-5300/trainer_state.json b/checkpoint-5300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..01c06afce9b8addb755964322512fc600e853d03 --- /dev/null +++ b/checkpoint-5300/trainer_state.json @@ -0,0 +1,7462 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9643377001455604, + "eval_steps": 100, + "global_step": 5300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + }, + { + "epoch": 0.8196870451237264, + "grad_norm": 0.1393742561340332, + "learning_rate": 4.178055288745053e-06, + "loss": 0.13769235610961914, + "step": 4505 + }, + { + "epoch": 0.8205967976710334, + "grad_norm": 0.1668994128704071, + "learning_rate": 4.137406700206617e-06, + "loss": 0.14029752016067504, + "step": 4510 + }, + { + "epoch": 0.8215065502183406, + "grad_norm": 0.1833454668521881, + "learning_rate": 4.0969389701515675e-06, + "loss": 0.14276301860809326, + "step": 4515 + }, + { + "epoch": 0.8224163027656477, + "grad_norm": 0.16187874972820282, + "learning_rate": 4.056652449395945e-06, + "loss": 0.1444832682609558, + "step": 4520 + }, + { + "epoch": 0.8233260553129549, + "grad_norm": 0.1453280746936798, + "learning_rate": 4.01654748718488e-06, + "loss": 0.14512733221054078, + "step": 4525 + }, + { + "epoch": 0.824235807860262, + "grad_norm": 0.1782725751399994, + "learning_rate": 3.976624431189563e-06, + "loss": 0.14093561172485353, + "step": 4530 + }, + { + "epoch": 0.8251455604075691, + "grad_norm": 0.17374491691589355, + "learning_rate": 3.936883627504234e-06, + "loss": 0.14031401872634888, + "step": 4535 + }, + { + "epoch": 0.8260553129548762, + "grad_norm": 0.1609172821044922, + "learning_rate": 3.897325420643174e-06, + "loss": 0.1428336262702942, + "step": 4540 + }, + { + "epoch": 0.8269650655021834, + "grad_norm": 0.1520884931087494, + "learning_rate": 3.85795015353774e-06, + "loss": 0.1460547924041748, + "step": 4545 + }, + { + "epoch": 0.8278748180494906, + "grad_norm": 0.20986326038837433, + "learning_rate": 3.818758167533376e-06, + "loss": 0.14706350564956666, + "step": 4550 + }, + { + "epoch": 0.8287845705967977, + "grad_norm": 0.16825413703918457, + "learning_rate": 3.7797498023866396e-06, + "loss": 0.14507200717926025, + "step": 4555 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.16758380830287933, + "learning_rate": 3.740925396262296e-06, + "loss": 0.14898381233215333, + "step": 4560 + }, + { + "epoch": 0.8306040756914119, + "grad_norm": 0.15207453072071075, + "learning_rate": 3.7022852857303503e-06, + "loss": 0.14138854742050172, + "step": 4565 + }, + { + "epoch": 0.8315138282387191, + "grad_norm": 0.15150749683380127, + "learning_rate": 3.66382980576315e-06, + "loss": 0.13894975185394287, + "step": 4570 + }, + { + "epoch": 0.8324235807860262, + "grad_norm": 0.17071188986301422, + "learning_rate": 3.625559289732472e-06, + "loss": 0.14072470664978026, + "step": 4575 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.154335618019104, + "learning_rate": 3.5874740694066294e-06, + "loss": 0.13791344165802003, + "step": 4580 + }, + { + "epoch": 0.8342430858806404, + "grad_norm": 0.14017128944396973, + "learning_rate": 3.5495744749476116e-06, + "loss": 0.14427922964096068, + "step": 4585 + }, + { + "epoch": 0.8351528384279476, + "grad_norm": 0.17210033535957336, + "learning_rate": 3.5118608349081983e-06, + "loss": 0.15191166400909423, + "step": 4590 + }, + { + "epoch": 0.8360625909752547, + "grad_norm": 0.18715685606002808, + "learning_rate": 3.4743334762291358e-06, + "loss": 0.14451316595077515, + "step": 4595 + }, + { + "epoch": 0.8369723435225619, + "grad_norm": 0.18079884350299835, + "learning_rate": 3.436992724236293e-06, + "loss": 0.13530746698379517, + "step": 4600 + }, + { + "epoch": 0.837882096069869, + "grad_norm": 0.13519920408725739, + "learning_rate": 3.399838902637817e-06, + "loss": 0.1477964401245117, + "step": 4605 + }, + { + "epoch": 0.8387918486171762, + "grad_norm": 0.1778026670217514, + "learning_rate": 3.3628723335213885e-06, + "loss": 0.14419831037521363, + "step": 4610 + }, + { + "epoch": 0.8397016011644832, + "grad_norm": 0.15165366232395172, + "learning_rate": 3.326093337351355e-06, + "loss": 0.13888469934463502, + "step": 4615 + }, + { + "epoch": 0.8406113537117904, + "grad_norm": 0.17049473524093628, + "learning_rate": 3.2895022329660018e-06, + "loss": 0.14438477754592896, + "step": 4620 + }, + { + "epoch": 0.8415211062590975, + "grad_norm": 0.16536414623260498, + "learning_rate": 3.2530993375747833e-06, + "loss": 0.1444351315498352, + "step": 4625 + }, + { + "epoch": 0.8424308588064047, + "grad_norm": 0.17570015788078308, + "learning_rate": 3.2168849667555402e-06, + "loss": 0.13861945867538453, + "step": 4630 + }, + { + "epoch": 0.8433406113537117, + "grad_norm": 0.1699545532464981, + "learning_rate": 3.1808594344518132e-06, + "loss": 0.13902754783630372, + "step": 4635 + }, + { + "epoch": 0.8442503639010189, + "grad_norm": 0.12331254780292511, + "learning_rate": 3.1450230529700837e-06, + "loss": 0.14104254245758058, + "step": 4640 + }, + { + "epoch": 0.845160116448326, + "grad_norm": 0.1508190929889679, + "learning_rate": 3.1093761329770708e-06, + "loss": 0.13288766145706177, + "step": 4645 + }, + { + "epoch": 0.8460698689956332, + "grad_norm": 0.19049489498138428, + "learning_rate": 3.0739189834970735e-06, + "loss": 0.14914840459823608, + "step": 4650 + }, + { + "epoch": 0.8469796215429404, + "grad_norm": 0.1662369966506958, + "learning_rate": 3.0386519119092293e-06, + "loss": 0.14222898483276367, + "step": 4655 + }, + { + "epoch": 0.8478893740902474, + "grad_norm": 0.18985967338085175, + "learning_rate": 3.0035752239449126e-06, + "loss": 0.14431113004684448, + "step": 4660 + }, + { + "epoch": 0.8487991266375546, + "grad_norm": 0.17005261778831482, + "learning_rate": 2.9686892236850337e-06, + "loss": 0.14140807390213012, + "step": 4665 + }, + { + "epoch": 0.8497088791848617, + "grad_norm": 0.16786684095859528, + "learning_rate": 2.9339942135574394e-06, + "loss": 0.14161460399627684, + "step": 4670 + }, + { + "epoch": 0.8506186317321689, + "grad_norm": 0.16358181834220886, + "learning_rate": 2.899490494334281e-06, + "loss": 0.14674670696258546, + "step": 4675 + }, + { + "epoch": 0.851528384279476, + "grad_norm": 0.1651349812746048, + "learning_rate": 2.8651783651293867e-06, + "loss": 0.13794611692428588, + "step": 4680 + }, + { + "epoch": 0.8524381368267832, + "grad_norm": 0.16934923827648163, + "learning_rate": 2.831058123395694e-06, + "loss": 0.13199397325515747, + "step": 4685 + }, + { + "epoch": 0.8533478893740902, + "grad_norm": 0.1704150140285492, + "learning_rate": 2.797130064922665e-06, + "loss": 0.14044904708862305, + "step": 4690 + }, + { + "epoch": 0.8542576419213974, + "grad_norm": 0.1814192682504654, + "learning_rate": 2.7633944838337143e-06, + "loss": 0.1465100646018982, + "step": 4695 + }, + { + "epoch": 0.8551673944687045, + "grad_norm": 0.18942610919475555, + "learning_rate": 2.729851672583669e-06, + "loss": 0.14685982465744019, + "step": 4700 + }, + { + "epoch": 0.8560771470160117, + "grad_norm": 0.17895208299160004, + "learning_rate": 2.6965019219562155e-06, + "loss": 0.13971571922302245, + "step": 4705 + }, + { + "epoch": 0.8569868995633187, + "grad_norm": 0.22735828161239624, + "learning_rate": 2.6633455210614055e-06, + "loss": 0.13776102066040039, + "step": 4710 + }, + { + "epoch": 0.8578966521106259, + "grad_norm": 0.16779793798923492, + "learning_rate": 2.630382757333133e-06, + "loss": 0.14134042263031005, + "step": 4715 + }, + { + "epoch": 0.858806404657933, + "grad_norm": 0.2148888260126114, + "learning_rate": 2.597613916526637e-06, + "loss": 0.14680721759796142, + "step": 4720 + }, + { + "epoch": 0.8597161572052402, + "grad_norm": 0.16560257971286774, + "learning_rate": 2.565039282716045e-06, + "loss": 0.14137234687805175, + "step": 4725 + }, + { + "epoch": 0.8606259097525473, + "grad_norm": 0.16197068989276886, + "learning_rate": 2.532659138291879e-06, + "loss": 0.14969314336776735, + "step": 4730 + }, + { + "epoch": 0.8615356622998545, + "grad_norm": 0.14650246500968933, + "learning_rate": 2.5004737639586497e-06, + "loss": 0.13532910346984864, + "step": 4735 + }, + { + "epoch": 0.8624454148471615, + "grad_norm": 0.1565634310245514, + "learning_rate": 2.4684834387323943e-06, + "loss": 0.14146244525909424, + "step": 4740 + }, + { + "epoch": 0.8633551673944687, + "grad_norm": 0.18060864508152008, + "learning_rate": 2.4366884399382393e-06, + "loss": 0.14218534231185914, + "step": 4745 + }, + { + "epoch": 0.8642649199417758, + "grad_norm": 0.24613255262374878, + "learning_rate": 2.4050890432080557e-06, + "loss": 0.13907679319381713, + "step": 4750 + }, + { + "epoch": 0.865174672489083, + "grad_norm": 0.16036023199558258, + "learning_rate": 2.3736855224780057e-06, + "loss": 0.13718113899230958, + "step": 4755 + }, + { + "epoch": 0.86608442503639, + "grad_norm": 0.16678516566753387, + "learning_rate": 2.3424781499862075e-06, + "loss": 0.1327962040901184, + "step": 4760 + }, + { + "epoch": 0.8669941775836972, + "grad_norm": 0.1763770878314972, + "learning_rate": 2.3114671962703727e-06, + "loss": 0.14390318393707274, + "step": 4765 + }, + { + "epoch": 0.8679039301310044, + "grad_norm": 0.17735697329044342, + "learning_rate": 2.280652930165428e-06, + "loss": 0.15223288536071777, + "step": 4770 + }, + { + "epoch": 0.8688136826783115, + "grad_norm": 0.15827041864395142, + "learning_rate": 2.250035618801241e-06, + "loss": 0.14296332597732545, + "step": 4775 + }, + { + "epoch": 0.8697234352256187, + "grad_norm": 0.16876135766506195, + "learning_rate": 2.219615527600244e-06, + "loss": 0.1359076738357544, + "step": 4780 + }, + { + "epoch": 0.8706331877729258, + "grad_norm": 0.1800110638141632, + "learning_rate": 2.189392920275174e-06, + "loss": 0.1424281358718872, + "step": 4785 + }, + { + "epoch": 0.8715429403202329, + "grad_norm": 0.1409560889005661, + "learning_rate": 2.159368058826783e-06, + "loss": 0.14480490684509278, + "step": 4790 + }, + { + "epoch": 0.87245269286754, + "grad_norm": 0.1634288728237152, + "learning_rate": 2.129541203541535e-06, + "loss": 0.14513269662857056, + "step": 4795 + }, + { + "epoch": 0.8733624454148472, + "grad_norm": 0.17126062512397766, + "learning_rate": 2.099912612989391e-06, + "loss": 0.13546934127807617, + "step": 4800 + }, + { + "epoch": 0.8742721979621543, + "grad_norm": 0.16704080998897552, + "learning_rate": 2.0704825440215457e-06, + "loss": 0.13852492570877076, + "step": 4805 + }, + { + "epoch": 0.8751819505094615, + "grad_norm": 0.1725970208644867, + "learning_rate": 2.0412512517681946e-06, + "loss": 0.14504197835922242, + "step": 4810 + }, + { + "epoch": 0.8760917030567685, + "grad_norm": 0.1700201779603958, + "learning_rate": 2.0122189896363387e-06, + "loss": 0.14312338829040527, + "step": 4815 + }, + { + "epoch": 0.8770014556040757, + "grad_norm": 0.16491736471652985, + "learning_rate": 1.9833860093075834e-06, + "loss": 0.14062976837158203, + "step": 4820 + }, + { + "epoch": 0.8779112081513828, + "grad_norm": 0.13748787343502045, + "learning_rate": 1.9547525607359537e-06, + "loss": 0.1346171498298645, + "step": 4825 + }, + { + "epoch": 0.87882096069869, + "grad_norm": 0.16399399936199188, + "learning_rate": 1.926318892145712e-06, + "loss": 0.14178123474121093, + "step": 4830 + }, + { + "epoch": 0.879730713245997, + "grad_norm": 0.14491963386535645, + "learning_rate": 1.8980852500292412e-06, + "loss": 0.1408564567565918, + "step": 4835 + }, + { + "epoch": 0.8806404657933042, + "grad_norm": 0.17335423827171326, + "learning_rate": 1.8700518791448851e-06, + "loss": 0.14403265714645386, + "step": 4840 + }, + { + "epoch": 0.8815502183406113, + "grad_norm": 0.17399625480175018, + "learning_rate": 1.8422190225148155e-06, + "loss": 0.14289036989212037, + "step": 4845 + }, + { + "epoch": 0.8824599708879185, + "grad_norm": 0.17945612967014313, + "learning_rate": 1.814586921422956e-06, + "loss": 0.14494109153747559, + "step": 4850 + }, + { + "epoch": 0.8833697234352256, + "grad_norm": 0.1910620480775833, + "learning_rate": 1.7871558154128664e-06, + "loss": 0.13726245164871215, + "step": 4855 + }, + { + "epoch": 0.8842794759825328, + "grad_norm": 0.1771879345178604, + "learning_rate": 1.7599259422856756e-06, + "loss": 0.1464752197265625, + "step": 4860 + }, + { + "epoch": 0.8851892285298398, + "grad_norm": 0.19427461922168732, + "learning_rate": 1.7328975380980218e-06, + "loss": 0.13823356628417968, + "step": 4865 + }, + { + "epoch": 0.886098981077147, + "grad_norm": 0.1491149365901947, + "learning_rate": 1.7060708371599897e-06, + "loss": 0.1338604211807251, + "step": 4870 + }, + { + "epoch": 0.8870087336244541, + "grad_norm": 0.16087733209133148, + "learning_rate": 1.6794460720331057e-06, + "loss": 0.14184389114379883, + "step": 4875 + }, + { + "epoch": 0.8879184861717613, + "grad_norm": 0.14506325125694275, + "learning_rate": 1.653023473528309e-06, + "loss": 0.14267687797546386, + "step": 4880 + }, + { + "epoch": 0.8888282387190685, + "grad_norm": 0.16886365413665771, + "learning_rate": 1.626803270703936e-06, + "loss": 0.14266083240509034, + "step": 4885 + }, + { + "epoch": 0.8897379912663755, + "grad_norm": 0.1891999989748001, + "learning_rate": 1.6007856908637652e-06, + "loss": 0.1398016929626465, + "step": 4890 + }, + { + "epoch": 0.8906477438136827, + "grad_norm": 0.17645299434661865, + "learning_rate": 1.5749709595550083e-06, + "loss": 0.13869571685791016, + "step": 4895 + }, + { + "epoch": 0.8915574963609898, + "grad_norm": 0.17714262008666992, + "learning_rate": 1.549359300566408e-06, + "loss": 0.14957486391067504, + "step": 4900 + }, + { + "epoch": 0.892467248908297, + "grad_norm": 0.18025240302085876, + "learning_rate": 1.5239509359262355e-06, + "loss": 0.1358652949333191, + "step": 4905 + }, + { + "epoch": 0.8933770014556041, + "grad_norm": 0.17539937794208527, + "learning_rate": 1.4987460859004154e-06, + "loss": 0.13833394050598144, + "step": 4910 + }, + { + "epoch": 0.8942867540029112, + "grad_norm": 0.1772230565547943, + "learning_rate": 1.4737449689905953e-06, + "loss": 0.14202116727828978, + "step": 4915 + }, + { + "epoch": 0.8951965065502183, + "grad_norm": 0.1670161783695221, + "learning_rate": 1.4489478019322433e-06, + "loss": 0.1403665542602539, + "step": 4920 + }, + { + "epoch": 0.8961062590975255, + "grad_norm": 0.1697034239768982, + "learning_rate": 1.4243547996927926e-06, + "loss": 0.1401481032371521, + "step": 4925 + }, + { + "epoch": 0.8970160116448326, + "grad_norm": 0.16474860906600952, + "learning_rate": 1.3999661754697636e-06, + "loss": 0.13969850540161133, + "step": 4930 + }, + { + "epoch": 0.8979257641921398, + "grad_norm": 0.1664883941411972, + "learning_rate": 1.3757821406889027e-06, + "loss": 0.1399069309234619, + "step": 4935 + }, + { + "epoch": 0.8988355167394468, + "grad_norm": 0.16675794124603271, + "learning_rate": 1.351802905002386e-06, + "loss": 0.14129226207733153, + "step": 4940 + }, + { + "epoch": 0.899745269286754, + "grad_norm": 0.17529809474945068, + "learning_rate": 1.3280286762869632e-06, + "loss": 0.14663081169128417, + "step": 4945 + }, + { + "epoch": 0.9006550218340611, + "grad_norm": 0.17758169770240784, + "learning_rate": 1.3044596606421795e-06, + "loss": 0.13986254930496217, + "step": 4950 + }, + { + "epoch": 0.9015647743813683, + "grad_norm": 0.153225839138031, + "learning_rate": 1.2810960623885815e-06, + "loss": 0.14236698150634766, + "step": 4955 + }, + { + "epoch": 0.9024745269286754, + "grad_norm": 0.169761523604393, + "learning_rate": 1.2579380840659376e-06, + "loss": 0.1450445055961609, + "step": 4960 + }, + { + "epoch": 0.9033842794759825, + "grad_norm": 0.16659331321716309, + "learning_rate": 1.2349859264315034e-06, + "loss": 0.14043926000595092, + "step": 4965 + }, + { + "epoch": 0.9042940320232896, + "grad_norm": 0.16748706996440887, + "learning_rate": 1.2122397884582553e-06, + "loss": 0.14725675582885742, + "step": 4970 + }, + { + "epoch": 0.9052037845705968, + "grad_norm": 0.1600511223077774, + "learning_rate": 1.1896998673331883e-06, + "loss": 0.14551150798797607, + "step": 4975 + }, + { + "epoch": 0.9061135371179039, + "grad_norm": 0.24318362772464752, + "learning_rate": 1.1673663584555934e-06, + "loss": 0.14470888376235963, + "step": 4980 + }, + { + "epoch": 0.9070232896652111, + "grad_norm": 0.16443821787834167, + "learning_rate": 1.1452394554353706e-06, + "loss": 0.13639854192733764, + "step": 4985 + }, + { + "epoch": 0.9079330422125182, + "grad_norm": 0.14277774095535278, + "learning_rate": 1.1233193500913453e-06, + "loss": 0.13749881982803344, + "step": 4990 + }, + { + "epoch": 0.9088427947598253, + "grad_norm": 0.1610947549343109, + "learning_rate": 1.1016062324496008e-06, + "loss": 0.1385629653930664, + "step": 4995 + }, + { + "epoch": 0.9097525473071325, + "grad_norm": 0.17888498306274414, + "learning_rate": 1.080100290741845e-06, + "loss": 0.14225621223449708, + "step": 5000 + }, + { + "epoch": 0.9106622998544396, + "grad_norm": 0.17488449811935425, + "learning_rate": 1.0588017114037729e-06, + "loss": 0.14187805652618407, + "step": 5005 + }, + { + "epoch": 0.9115720524017468, + "grad_norm": 0.16410665214061737, + "learning_rate": 1.0377106790734392e-06, + "loss": 0.1407416582107544, + "step": 5010 + }, + { + "epoch": 0.9124818049490538, + "grad_norm": 0.18115971982479095, + "learning_rate": 1.016827376589674e-06, + "loss": 0.1427263855934143, + "step": 5015 + }, + { + "epoch": 0.913391557496361, + "grad_norm": 0.18507841229438782, + "learning_rate": 9.961519849904898e-07, + "loss": 0.1390499472618103, + "step": 5020 + }, + { + "epoch": 0.9143013100436681, + "grad_norm": 0.21296796202659607, + "learning_rate": 9.75684683511513e-07, + "loss": 0.1382216691970825, + "step": 5025 + }, + { + "epoch": 0.9152110625909753, + "grad_norm": 0.2308044582605362, + "learning_rate": 9.55425649584435e-07, + "loss": 0.14271280765533448, + "step": 5030 + }, + { + "epoch": 0.9161208151382824, + "grad_norm": 0.15796682238578796, + "learning_rate": 9.353750588354527e-07, + "loss": 0.13807624578475952, + "step": 5035 + }, + { + "epoch": 0.9170305676855895, + "grad_norm": 0.1695316582918167, + "learning_rate": 9.155330850837834e-07, + "loss": 0.14289476871490478, + "step": 5040 + }, + { + "epoch": 0.9179403202328966, + "grad_norm": 0.1738404780626297, + "learning_rate": 8.958999003401191e-07, + "loss": 0.14070619344711305, + "step": 5045 + }, + { + "epoch": 0.9188500727802038, + "grad_norm": 0.20618964731693268, + "learning_rate": 8.764756748051662e-07, + "loss": 0.14535053968429565, + "step": 5050 + }, + { + "epoch": 0.9197598253275109, + "grad_norm": 0.1506137251853943, + "learning_rate": 8.572605768681546e-07, + "loss": 0.13995139598846434, + "step": 5055 + }, + { + "epoch": 0.9206695778748181, + "grad_norm": 0.17772039771080017, + "learning_rate": 8.382547731053708e-07, + "loss": 0.14470311403274536, + "step": 5060 + }, + { + "epoch": 0.9215793304221251, + "grad_norm": 0.19897456467151642, + "learning_rate": 8.194584282787382e-07, + "loss": 0.144488525390625, + "step": 5065 + }, + { + "epoch": 0.9224890829694323, + "grad_norm": 0.15899236500263214, + "learning_rate": 8.008717053343606e-07, + "loss": 0.1352991580963135, + "step": 5070 + }, + { + "epoch": 0.9233988355167394, + "grad_norm": 0.14965768158435822, + "learning_rate": 7.824947654011345e-07, + "loss": 0.13827911615371705, + "step": 5075 + }, + { + "epoch": 0.9243085880640466, + "grad_norm": 0.43651485443115234, + "learning_rate": 7.643277677893329e-07, + "loss": 0.14149526357650757, + "step": 5080 + }, + { + "epoch": 0.9252183406113537, + "grad_norm": 0.19912713766098022, + "learning_rate": 7.463708699892325e-07, + "loss": 0.14357032775878906, + "step": 5085 + }, + { + "epoch": 0.9261280931586608, + "grad_norm": 0.1635904610157013, + "learning_rate": 7.286242276697524e-07, + "loss": 0.13550699949264527, + "step": 5090 + }, + { + "epoch": 0.9270378457059679, + "grad_norm": 0.19391080737113953, + "learning_rate": 7.11087994677101e-07, + "loss": 0.14674756526947022, + "step": 5095 + }, + { + "epoch": 0.9279475982532751, + "grad_norm": 0.17458125948905945, + "learning_rate": 6.937623230334284e-07, + "loss": 0.14155579805374147, + "step": 5100 + }, + { + "epoch": 0.9288573508005823, + "grad_norm": 0.1617971807718277, + "learning_rate": 6.766473629355452e-07, + "loss": 0.140555477142334, + "step": 5105 + }, + { + "epoch": 0.9297671033478894, + "grad_norm": 0.16945427656173706, + "learning_rate": 6.59743262753576e-07, + "loss": 0.13607511520385743, + "step": 5110 + }, + { + "epoch": 0.9306768558951966, + "grad_norm": 0.18347840011119843, + "learning_rate": 6.43050169029702e-07, + "loss": 0.14903461933135986, + "step": 5115 + }, + { + "epoch": 0.9315866084425036, + "grad_norm": 0.15434837341308594, + "learning_rate": 6.265682264768869e-07, + "loss": 0.14146015644073487, + "step": 5120 + }, + { + "epoch": 0.9324963609898108, + "grad_norm": 0.1397712528705597, + "learning_rate": 6.10297577977606e-07, + "loss": 0.14261592626571656, + "step": 5125 + }, + { + "epoch": 0.9334061135371179, + "grad_norm": 0.1765873283147812, + "learning_rate": 5.942383645826361e-07, + "loss": 0.13559447526931762, + "step": 5130 + }, + { + "epoch": 0.9343158660844251, + "grad_norm": 0.1656057983636856, + "learning_rate": 5.783907255098003e-07, + "loss": 0.13961490392684936, + "step": 5135 + }, + { + "epoch": 0.9352256186317321, + "grad_norm": 0.2169366180896759, + "learning_rate": 5.627547981427894e-07, + "loss": 0.1447835922241211, + "step": 5140 + }, + { + "epoch": 0.9361353711790393, + "grad_norm": 0.18623125553131104, + "learning_rate": 5.473307180299508e-07, + "loss": 0.14366730451583862, + "step": 5145 + }, + { + "epoch": 0.9370451237263464, + "grad_norm": 0.15423963963985443, + "learning_rate": 5.32118618883129e-07, + "loss": 0.14295632839202882, + "step": 5150 + }, + { + "epoch": 0.9379548762736536, + "grad_norm": 0.18423247337341309, + "learning_rate": 5.17118632576491e-07, + "loss": 0.14137414693832398, + "step": 5155 + }, + { + "epoch": 0.9388646288209607, + "grad_norm": 0.15338757634162903, + "learning_rate": 5.023308891453915e-07, + "loss": 0.13583066463470458, + "step": 5160 + }, + { + "epoch": 0.9397743813682679, + "grad_norm": 0.2293633222579956, + "learning_rate": 4.877555167852515e-07, + "loss": 0.14819620847702025, + "step": 5165 + }, + { + "epoch": 0.9406841339155749, + "grad_norm": 0.16889944672584534, + "learning_rate": 4.7339264185043974e-07, + "loss": 0.13617686033248902, + "step": 5170 + }, + { + "epoch": 0.9415938864628821, + "grad_norm": 0.1767464578151703, + "learning_rate": 4.5924238885316775e-07, + "loss": 0.13487552404403685, + "step": 5175 + }, + { + "epoch": 0.9425036390101892, + "grad_norm": 0.16697899997234344, + "learning_rate": 4.453048804624327e-07, + "loss": 0.1446886420249939, + "step": 5180 + }, + { + "epoch": 0.9434133915574964, + "grad_norm": 0.19576266407966614, + "learning_rate": 4.315802375029293e-07, + "loss": 0.14252450466156005, + "step": 5185 + }, + { + "epoch": 0.9443231441048034, + "grad_norm": 0.14838077127933502, + "learning_rate": 4.18068578954034e-07, + "loss": 0.13933032751083374, + "step": 5190 + }, + { + "epoch": 0.9452328966521106, + "grad_norm": 0.18481744825839996, + "learning_rate": 4.047700219487388e-07, + "loss": 0.1410665273666382, + "step": 5195 + }, + { + "epoch": 0.9461426491994177, + "grad_norm": 0.16954176127910614, + "learning_rate": 3.9168468177265547e-07, + "loss": 0.1421758770942688, + "step": 5200 + }, + { + "epoch": 0.9470524017467249, + "grad_norm": 0.17614421248435974, + "learning_rate": 3.7881267186301306e-07, + "loss": 0.14059911966323851, + "step": 5205 + }, + { + "epoch": 0.9479621542940321, + "grad_norm": 0.1637226939201355, + "learning_rate": 3.6615410380767544e-07, + "loss": 0.1360395908355713, + "step": 5210 + }, + { + "epoch": 0.9488719068413392, + "grad_norm": 0.18330250680446625, + "learning_rate": 3.5370908734417006e-07, + "loss": 0.14543824195861815, + "step": 5215 + }, + { + "epoch": 0.9497816593886463, + "grad_norm": 0.1895420402288437, + "learning_rate": 3.414777303587413e-07, + "loss": 0.15304578542709352, + "step": 5220 + }, + { + "epoch": 0.9506914119359534, + "grad_norm": 0.15384933352470398, + "learning_rate": 3.294601388854041e-07, + "loss": 0.14675912857055665, + "step": 5225 + }, + { + "epoch": 0.9516011644832606, + "grad_norm": 0.20188499987125397, + "learning_rate": 3.1765641710505e-07, + "loss": 0.14068362712860108, + "step": 5230 + }, + { + "epoch": 0.9525109170305677, + "grad_norm": 0.16467279195785522, + "learning_rate": 3.060666673445123e-07, + "loss": 0.14733167886734008, + "step": 5235 + }, + { + "epoch": 0.9534206695778749, + "grad_norm": 0.16632016003131866, + "learning_rate": 2.9469099007569943e-07, + "loss": 0.13753929138183593, + "step": 5240 + }, + { + "epoch": 0.9543304221251819, + "grad_norm": 0.1477566957473755, + "learning_rate": 2.83529483914724e-07, + "loss": 0.14354891777038575, + "step": 5245 + }, + { + "epoch": 0.9552401746724891, + "grad_norm": 0.1693645417690277, + "learning_rate": 2.7258224562102805e-07, + "loss": 0.14622807502746582, + "step": 5250 + }, + { + "epoch": 0.9561499272197962, + "grad_norm": 0.17574062943458557, + "learning_rate": 2.6184937009657295e-07, + "loss": 0.1344899296760559, + "step": 5255 + }, + { + "epoch": 0.9570596797671034, + "grad_norm": 0.17448563873767853, + "learning_rate": 2.513309503850009e-07, + "loss": 0.1355789542198181, + "step": 5260 + }, + { + "epoch": 0.9579694323144105, + "grad_norm": 0.16993778944015503, + "learning_rate": 2.41027077670819e-07, + "loss": 0.151595401763916, + "step": 5265 + }, + { + "epoch": 0.9588791848617176, + "grad_norm": 0.16944102942943573, + "learning_rate": 2.3093784127863062e-07, + "loss": 0.1466623306274414, + "step": 5270 + }, + { + "epoch": 0.9597889374090247, + "grad_norm": 0.18085163831710815, + "learning_rate": 2.2106332867234402e-07, + "loss": 0.14645814895629883, + "step": 5275 + }, + { + "epoch": 0.9606986899563319, + "grad_norm": 0.14682307839393616, + "learning_rate": 2.1140362545442605e-07, + "loss": 0.13901774883270263, + "step": 5280 + }, + { + "epoch": 0.961608442503639, + "grad_norm": 0.17189526557922363, + "learning_rate": 2.0195881536514694e-07, + "loss": 0.14153491258621215, + "step": 5285 + }, + { + "epoch": 0.9625181950509462, + "grad_norm": 0.1977207362651825, + "learning_rate": 1.9272898028186714e-07, + "loss": 0.1437437653541565, + "step": 5290 + }, + { + "epoch": 0.9634279475982532, + "grad_norm": 0.16637668013572693, + "learning_rate": 1.837142002183184e-07, + "loss": 0.13910138607025146, + "step": 5295 + }, + { + "epoch": 0.9643377001455604, + "grad_norm": 0.18155774474143982, + "learning_rate": 1.7491455332391548e-07, + "loss": 0.14177814722061158, + "step": 5300 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.91270363724079e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5300/training_args.bin b/checkpoint-5300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-5300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-5400/README.md b/checkpoint-5400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-5400/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-5400/adapter_config.json b/checkpoint-5400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-5400/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5400/adapter_model.safetensors b/checkpoint-5400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ed48240c1e1bedc059d842cc673ed7231ebea701 --- /dev/null +++ b/checkpoint-5400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a155a62a6b82ab171b303deed63e164255b839c4d35d3f4dc73a391c560321 +size 169741912 diff --git a/checkpoint-5400/chat_template.jinja b/checkpoint-5400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-5400/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-5400/optimizer.pt b/checkpoint-5400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4d02c065e044192e5fc5d300659ce1b63ec0a01 --- /dev/null +++ b/checkpoint-5400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3ce57aac76e6b3a00f67ce909a57612ce91cf2745dbc74a149dbdb1b267e843 +size 72807355 diff --git a/checkpoint-5400/processor_config.json b/checkpoint-5400/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-5400/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-5400/rng_state.pth b/checkpoint-5400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-5400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-5400/scheduler.pt b/checkpoint-5400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..66d1eba99819f04ced6a1cd66ed865261031dfc2 --- /dev/null +++ b/checkpoint-5400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2383841240df643f2cc3bec80f20fb5b80bd822a02870b923d76fb12aba5b41 +size 1465 diff --git a/checkpoint-5400/tokenizer.json b/checkpoint-5400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-5400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-5400/tokenizer_config.json b/checkpoint-5400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-5400/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-5400/trainer_state.json b/checkpoint-5400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7c03c77701b49c9fe8ee1dbcd6359f15bab762bd --- /dev/null +++ b/checkpoint-5400/trainer_state.json @@ -0,0 +1,7602 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.982532751091703, + "eval_steps": 100, + "global_step": 5400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + }, + { + "epoch": 0.8196870451237264, + "grad_norm": 0.1393742561340332, + "learning_rate": 4.178055288745053e-06, + "loss": 0.13769235610961914, + "step": 4505 + }, + { + "epoch": 0.8205967976710334, + "grad_norm": 0.1668994128704071, + "learning_rate": 4.137406700206617e-06, + "loss": 0.14029752016067504, + "step": 4510 + }, + { + "epoch": 0.8215065502183406, + "grad_norm": 0.1833454668521881, + "learning_rate": 4.0969389701515675e-06, + "loss": 0.14276301860809326, + "step": 4515 + }, + { + "epoch": 0.8224163027656477, + "grad_norm": 0.16187874972820282, + "learning_rate": 4.056652449395945e-06, + "loss": 0.1444832682609558, + "step": 4520 + }, + { + "epoch": 0.8233260553129549, + "grad_norm": 0.1453280746936798, + "learning_rate": 4.01654748718488e-06, + "loss": 0.14512733221054078, + "step": 4525 + }, + { + "epoch": 0.824235807860262, + "grad_norm": 0.1782725751399994, + "learning_rate": 3.976624431189563e-06, + "loss": 0.14093561172485353, + "step": 4530 + }, + { + "epoch": 0.8251455604075691, + "grad_norm": 0.17374491691589355, + "learning_rate": 3.936883627504234e-06, + "loss": 0.14031401872634888, + "step": 4535 + }, + { + "epoch": 0.8260553129548762, + "grad_norm": 0.1609172821044922, + "learning_rate": 3.897325420643174e-06, + "loss": 0.1428336262702942, + "step": 4540 + }, + { + "epoch": 0.8269650655021834, + "grad_norm": 0.1520884931087494, + "learning_rate": 3.85795015353774e-06, + "loss": 0.1460547924041748, + "step": 4545 + }, + { + "epoch": 0.8278748180494906, + "grad_norm": 0.20986326038837433, + "learning_rate": 3.818758167533376e-06, + "loss": 0.14706350564956666, + "step": 4550 + }, + { + "epoch": 0.8287845705967977, + "grad_norm": 0.16825413703918457, + "learning_rate": 3.7797498023866396e-06, + "loss": 0.14507200717926025, + "step": 4555 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.16758380830287933, + "learning_rate": 3.740925396262296e-06, + "loss": 0.14898381233215333, + "step": 4560 + }, + { + "epoch": 0.8306040756914119, + "grad_norm": 0.15207453072071075, + "learning_rate": 3.7022852857303503e-06, + "loss": 0.14138854742050172, + "step": 4565 + }, + { + "epoch": 0.8315138282387191, + "grad_norm": 0.15150749683380127, + "learning_rate": 3.66382980576315e-06, + "loss": 0.13894975185394287, + "step": 4570 + }, + { + "epoch": 0.8324235807860262, + "grad_norm": 0.17071188986301422, + "learning_rate": 3.625559289732472e-06, + "loss": 0.14072470664978026, + "step": 4575 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.154335618019104, + "learning_rate": 3.5874740694066294e-06, + "loss": 0.13791344165802003, + "step": 4580 + }, + { + "epoch": 0.8342430858806404, + "grad_norm": 0.14017128944396973, + "learning_rate": 3.5495744749476116e-06, + "loss": 0.14427922964096068, + "step": 4585 + }, + { + "epoch": 0.8351528384279476, + "grad_norm": 0.17210033535957336, + "learning_rate": 3.5118608349081983e-06, + "loss": 0.15191166400909423, + "step": 4590 + }, + { + "epoch": 0.8360625909752547, + "grad_norm": 0.18715685606002808, + "learning_rate": 3.4743334762291358e-06, + "loss": 0.14451316595077515, + "step": 4595 + }, + { + "epoch": 0.8369723435225619, + "grad_norm": 0.18079884350299835, + "learning_rate": 3.436992724236293e-06, + "loss": 0.13530746698379517, + "step": 4600 + }, + { + "epoch": 0.837882096069869, + "grad_norm": 0.13519920408725739, + "learning_rate": 3.399838902637817e-06, + "loss": 0.1477964401245117, + "step": 4605 + }, + { + "epoch": 0.8387918486171762, + "grad_norm": 0.1778026670217514, + "learning_rate": 3.3628723335213885e-06, + "loss": 0.14419831037521363, + "step": 4610 + }, + { + "epoch": 0.8397016011644832, + "grad_norm": 0.15165366232395172, + "learning_rate": 3.326093337351355e-06, + "loss": 0.13888469934463502, + "step": 4615 + }, + { + "epoch": 0.8406113537117904, + "grad_norm": 0.17049473524093628, + "learning_rate": 3.2895022329660018e-06, + "loss": 0.14438477754592896, + "step": 4620 + }, + { + "epoch": 0.8415211062590975, + "grad_norm": 0.16536414623260498, + "learning_rate": 3.2530993375747833e-06, + "loss": 0.1444351315498352, + "step": 4625 + }, + { + "epoch": 0.8424308588064047, + "grad_norm": 0.17570015788078308, + "learning_rate": 3.2168849667555402e-06, + "loss": 0.13861945867538453, + "step": 4630 + }, + { + "epoch": 0.8433406113537117, + "grad_norm": 0.1699545532464981, + "learning_rate": 3.1808594344518132e-06, + "loss": 0.13902754783630372, + "step": 4635 + }, + { + "epoch": 0.8442503639010189, + "grad_norm": 0.12331254780292511, + "learning_rate": 3.1450230529700837e-06, + "loss": 0.14104254245758058, + "step": 4640 + }, + { + "epoch": 0.845160116448326, + "grad_norm": 0.1508190929889679, + "learning_rate": 3.1093761329770708e-06, + "loss": 0.13288766145706177, + "step": 4645 + }, + { + "epoch": 0.8460698689956332, + "grad_norm": 0.19049489498138428, + "learning_rate": 3.0739189834970735e-06, + "loss": 0.14914840459823608, + "step": 4650 + }, + { + "epoch": 0.8469796215429404, + "grad_norm": 0.1662369966506958, + "learning_rate": 3.0386519119092293e-06, + "loss": 0.14222898483276367, + "step": 4655 + }, + { + "epoch": 0.8478893740902474, + "grad_norm": 0.18985967338085175, + "learning_rate": 3.0035752239449126e-06, + "loss": 0.14431113004684448, + "step": 4660 + }, + { + "epoch": 0.8487991266375546, + "grad_norm": 0.17005261778831482, + "learning_rate": 2.9686892236850337e-06, + "loss": 0.14140807390213012, + "step": 4665 + }, + { + "epoch": 0.8497088791848617, + "grad_norm": 0.16786684095859528, + "learning_rate": 2.9339942135574394e-06, + "loss": 0.14161460399627684, + "step": 4670 + }, + { + "epoch": 0.8506186317321689, + "grad_norm": 0.16358181834220886, + "learning_rate": 2.899490494334281e-06, + "loss": 0.14674670696258546, + "step": 4675 + }, + { + "epoch": 0.851528384279476, + "grad_norm": 0.1651349812746048, + "learning_rate": 2.8651783651293867e-06, + "loss": 0.13794611692428588, + "step": 4680 + }, + { + "epoch": 0.8524381368267832, + "grad_norm": 0.16934923827648163, + "learning_rate": 2.831058123395694e-06, + "loss": 0.13199397325515747, + "step": 4685 + }, + { + "epoch": 0.8533478893740902, + "grad_norm": 0.1704150140285492, + "learning_rate": 2.797130064922665e-06, + "loss": 0.14044904708862305, + "step": 4690 + }, + { + "epoch": 0.8542576419213974, + "grad_norm": 0.1814192682504654, + "learning_rate": 2.7633944838337143e-06, + "loss": 0.1465100646018982, + "step": 4695 + }, + { + "epoch": 0.8551673944687045, + "grad_norm": 0.18942610919475555, + "learning_rate": 2.729851672583669e-06, + "loss": 0.14685982465744019, + "step": 4700 + }, + { + "epoch": 0.8560771470160117, + "grad_norm": 0.17895208299160004, + "learning_rate": 2.6965019219562155e-06, + "loss": 0.13971571922302245, + "step": 4705 + }, + { + "epoch": 0.8569868995633187, + "grad_norm": 0.22735828161239624, + "learning_rate": 2.6633455210614055e-06, + "loss": 0.13776102066040039, + "step": 4710 + }, + { + "epoch": 0.8578966521106259, + "grad_norm": 0.16779793798923492, + "learning_rate": 2.630382757333133e-06, + "loss": 0.14134042263031005, + "step": 4715 + }, + { + "epoch": 0.858806404657933, + "grad_norm": 0.2148888260126114, + "learning_rate": 2.597613916526637e-06, + "loss": 0.14680721759796142, + "step": 4720 + }, + { + "epoch": 0.8597161572052402, + "grad_norm": 0.16560257971286774, + "learning_rate": 2.565039282716045e-06, + "loss": 0.14137234687805175, + "step": 4725 + }, + { + "epoch": 0.8606259097525473, + "grad_norm": 0.16197068989276886, + "learning_rate": 2.532659138291879e-06, + "loss": 0.14969314336776735, + "step": 4730 + }, + { + "epoch": 0.8615356622998545, + "grad_norm": 0.14650246500968933, + "learning_rate": 2.5004737639586497e-06, + "loss": 0.13532910346984864, + "step": 4735 + }, + { + "epoch": 0.8624454148471615, + "grad_norm": 0.1565634310245514, + "learning_rate": 2.4684834387323943e-06, + "loss": 0.14146244525909424, + "step": 4740 + }, + { + "epoch": 0.8633551673944687, + "grad_norm": 0.18060864508152008, + "learning_rate": 2.4366884399382393e-06, + "loss": 0.14218534231185914, + "step": 4745 + }, + { + "epoch": 0.8642649199417758, + "grad_norm": 0.24613255262374878, + "learning_rate": 2.4050890432080557e-06, + "loss": 0.13907679319381713, + "step": 4750 + }, + { + "epoch": 0.865174672489083, + "grad_norm": 0.16036023199558258, + "learning_rate": 2.3736855224780057e-06, + "loss": 0.13718113899230958, + "step": 4755 + }, + { + "epoch": 0.86608442503639, + "grad_norm": 0.16678516566753387, + "learning_rate": 2.3424781499862075e-06, + "loss": 0.1327962040901184, + "step": 4760 + }, + { + "epoch": 0.8669941775836972, + "grad_norm": 0.1763770878314972, + "learning_rate": 2.3114671962703727e-06, + "loss": 0.14390318393707274, + "step": 4765 + }, + { + "epoch": 0.8679039301310044, + "grad_norm": 0.17735697329044342, + "learning_rate": 2.280652930165428e-06, + "loss": 0.15223288536071777, + "step": 4770 + }, + { + "epoch": 0.8688136826783115, + "grad_norm": 0.15827041864395142, + "learning_rate": 2.250035618801241e-06, + "loss": 0.14296332597732545, + "step": 4775 + }, + { + "epoch": 0.8697234352256187, + "grad_norm": 0.16876135766506195, + "learning_rate": 2.219615527600244e-06, + "loss": 0.1359076738357544, + "step": 4780 + }, + { + "epoch": 0.8706331877729258, + "grad_norm": 0.1800110638141632, + "learning_rate": 2.189392920275174e-06, + "loss": 0.1424281358718872, + "step": 4785 + }, + { + "epoch": 0.8715429403202329, + "grad_norm": 0.1409560889005661, + "learning_rate": 2.159368058826783e-06, + "loss": 0.14480490684509278, + "step": 4790 + }, + { + "epoch": 0.87245269286754, + "grad_norm": 0.1634288728237152, + "learning_rate": 2.129541203541535e-06, + "loss": 0.14513269662857056, + "step": 4795 + }, + { + "epoch": 0.8733624454148472, + "grad_norm": 0.17126062512397766, + "learning_rate": 2.099912612989391e-06, + "loss": 0.13546934127807617, + "step": 4800 + }, + { + "epoch": 0.8742721979621543, + "grad_norm": 0.16704080998897552, + "learning_rate": 2.0704825440215457e-06, + "loss": 0.13852492570877076, + "step": 4805 + }, + { + "epoch": 0.8751819505094615, + "grad_norm": 0.1725970208644867, + "learning_rate": 2.0412512517681946e-06, + "loss": 0.14504197835922242, + "step": 4810 + }, + { + "epoch": 0.8760917030567685, + "grad_norm": 0.1700201779603958, + "learning_rate": 2.0122189896363387e-06, + "loss": 0.14312338829040527, + "step": 4815 + }, + { + "epoch": 0.8770014556040757, + "grad_norm": 0.16491736471652985, + "learning_rate": 1.9833860093075834e-06, + "loss": 0.14062976837158203, + "step": 4820 + }, + { + "epoch": 0.8779112081513828, + "grad_norm": 0.13748787343502045, + "learning_rate": 1.9547525607359537e-06, + "loss": 0.1346171498298645, + "step": 4825 + }, + { + "epoch": 0.87882096069869, + "grad_norm": 0.16399399936199188, + "learning_rate": 1.926318892145712e-06, + "loss": 0.14178123474121093, + "step": 4830 + }, + { + "epoch": 0.879730713245997, + "grad_norm": 0.14491963386535645, + "learning_rate": 1.8980852500292412e-06, + "loss": 0.1408564567565918, + "step": 4835 + }, + { + "epoch": 0.8806404657933042, + "grad_norm": 0.17335423827171326, + "learning_rate": 1.8700518791448851e-06, + "loss": 0.14403265714645386, + "step": 4840 + }, + { + "epoch": 0.8815502183406113, + "grad_norm": 0.17399625480175018, + "learning_rate": 1.8422190225148155e-06, + "loss": 0.14289036989212037, + "step": 4845 + }, + { + "epoch": 0.8824599708879185, + "grad_norm": 0.17945612967014313, + "learning_rate": 1.814586921422956e-06, + "loss": 0.14494109153747559, + "step": 4850 + }, + { + "epoch": 0.8833697234352256, + "grad_norm": 0.1910620480775833, + "learning_rate": 1.7871558154128664e-06, + "loss": 0.13726245164871215, + "step": 4855 + }, + { + "epoch": 0.8842794759825328, + "grad_norm": 0.1771879345178604, + "learning_rate": 1.7599259422856756e-06, + "loss": 0.1464752197265625, + "step": 4860 + }, + { + "epoch": 0.8851892285298398, + "grad_norm": 0.19427461922168732, + "learning_rate": 1.7328975380980218e-06, + "loss": 0.13823356628417968, + "step": 4865 + }, + { + "epoch": 0.886098981077147, + "grad_norm": 0.1491149365901947, + "learning_rate": 1.7060708371599897e-06, + "loss": 0.1338604211807251, + "step": 4870 + }, + { + "epoch": 0.8870087336244541, + "grad_norm": 0.16087733209133148, + "learning_rate": 1.6794460720331057e-06, + "loss": 0.14184389114379883, + "step": 4875 + }, + { + "epoch": 0.8879184861717613, + "grad_norm": 0.14506325125694275, + "learning_rate": 1.653023473528309e-06, + "loss": 0.14267687797546386, + "step": 4880 + }, + { + "epoch": 0.8888282387190685, + "grad_norm": 0.16886365413665771, + "learning_rate": 1.626803270703936e-06, + "loss": 0.14266083240509034, + "step": 4885 + }, + { + "epoch": 0.8897379912663755, + "grad_norm": 0.1891999989748001, + "learning_rate": 1.6007856908637652e-06, + "loss": 0.1398016929626465, + "step": 4890 + }, + { + "epoch": 0.8906477438136827, + "grad_norm": 0.17645299434661865, + "learning_rate": 1.5749709595550083e-06, + "loss": 0.13869571685791016, + "step": 4895 + }, + { + "epoch": 0.8915574963609898, + "grad_norm": 0.17714262008666992, + "learning_rate": 1.549359300566408e-06, + "loss": 0.14957486391067504, + "step": 4900 + }, + { + "epoch": 0.892467248908297, + "grad_norm": 0.18025240302085876, + "learning_rate": 1.5239509359262355e-06, + "loss": 0.1358652949333191, + "step": 4905 + }, + { + "epoch": 0.8933770014556041, + "grad_norm": 0.17539937794208527, + "learning_rate": 1.4987460859004154e-06, + "loss": 0.13833394050598144, + "step": 4910 + }, + { + "epoch": 0.8942867540029112, + "grad_norm": 0.1772230565547943, + "learning_rate": 1.4737449689905953e-06, + "loss": 0.14202116727828978, + "step": 4915 + }, + { + "epoch": 0.8951965065502183, + "grad_norm": 0.1670161783695221, + "learning_rate": 1.4489478019322433e-06, + "loss": 0.1403665542602539, + "step": 4920 + }, + { + "epoch": 0.8961062590975255, + "grad_norm": 0.1697034239768982, + "learning_rate": 1.4243547996927926e-06, + "loss": 0.1401481032371521, + "step": 4925 + }, + { + "epoch": 0.8970160116448326, + "grad_norm": 0.16474860906600952, + "learning_rate": 1.3999661754697636e-06, + "loss": 0.13969850540161133, + "step": 4930 + }, + { + "epoch": 0.8979257641921398, + "grad_norm": 0.1664883941411972, + "learning_rate": 1.3757821406889027e-06, + "loss": 0.1399069309234619, + "step": 4935 + }, + { + "epoch": 0.8988355167394468, + "grad_norm": 0.16675794124603271, + "learning_rate": 1.351802905002386e-06, + "loss": 0.14129226207733153, + "step": 4940 + }, + { + "epoch": 0.899745269286754, + "grad_norm": 0.17529809474945068, + "learning_rate": 1.3280286762869632e-06, + "loss": 0.14663081169128417, + "step": 4945 + }, + { + "epoch": 0.9006550218340611, + "grad_norm": 0.17758169770240784, + "learning_rate": 1.3044596606421795e-06, + "loss": 0.13986254930496217, + "step": 4950 + }, + { + "epoch": 0.9015647743813683, + "grad_norm": 0.153225839138031, + "learning_rate": 1.2810960623885815e-06, + "loss": 0.14236698150634766, + "step": 4955 + }, + { + "epoch": 0.9024745269286754, + "grad_norm": 0.169761523604393, + "learning_rate": 1.2579380840659376e-06, + "loss": 0.1450445055961609, + "step": 4960 + }, + { + "epoch": 0.9033842794759825, + "grad_norm": 0.16659331321716309, + "learning_rate": 1.2349859264315034e-06, + "loss": 0.14043926000595092, + "step": 4965 + }, + { + "epoch": 0.9042940320232896, + "grad_norm": 0.16748706996440887, + "learning_rate": 1.2122397884582553e-06, + "loss": 0.14725675582885742, + "step": 4970 + }, + { + "epoch": 0.9052037845705968, + "grad_norm": 0.1600511223077774, + "learning_rate": 1.1896998673331883e-06, + "loss": 0.14551150798797607, + "step": 4975 + }, + { + "epoch": 0.9061135371179039, + "grad_norm": 0.24318362772464752, + "learning_rate": 1.1673663584555934e-06, + "loss": 0.14470888376235963, + "step": 4980 + }, + { + "epoch": 0.9070232896652111, + "grad_norm": 0.16443821787834167, + "learning_rate": 1.1452394554353706e-06, + "loss": 0.13639854192733764, + "step": 4985 + }, + { + "epoch": 0.9079330422125182, + "grad_norm": 0.14277774095535278, + "learning_rate": 1.1233193500913453e-06, + "loss": 0.13749881982803344, + "step": 4990 + }, + { + "epoch": 0.9088427947598253, + "grad_norm": 0.1610947549343109, + "learning_rate": 1.1016062324496008e-06, + "loss": 0.1385629653930664, + "step": 4995 + }, + { + "epoch": 0.9097525473071325, + "grad_norm": 0.17888498306274414, + "learning_rate": 1.080100290741845e-06, + "loss": 0.14225621223449708, + "step": 5000 + }, + { + "epoch": 0.9106622998544396, + "grad_norm": 0.17488449811935425, + "learning_rate": 1.0588017114037729e-06, + "loss": 0.14187805652618407, + "step": 5005 + }, + { + "epoch": 0.9115720524017468, + "grad_norm": 0.16410665214061737, + "learning_rate": 1.0377106790734392e-06, + "loss": 0.1407416582107544, + "step": 5010 + }, + { + "epoch": 0.9124818049490538, + "grad_norm": 0.18115971982479095, + "learning_rate": 1.016827376589674e-06, + "loss": 0.1427263855934143, + "step": 5015 + }, + { + "epoch": 0.913391557496361, + "grad_norm": 0.18507841229438782, + "learning_rate": 9.961519849904898e-07, + "loss": 0.1390499472618103, + "step": 5020 + }, + { + "epoch": 0.9143013100436681, + "grad_norm": 0.21296796202659607, + "learning_rate": 9.75684683511513e-07, + "loss": 0.1382216691970825, + "step": 5025 + }, + { + "epoch": 0.9152110625909753, + "grad_norm": 0.2308044582605362, + "learning_rate": 9.55425649584435e-07, + "loss": 0.14271280765533448, + "step": 5030 + }, + { + "epoch": 0.9161208151382824, + "grad_norm": 0.15796682238578796, + "learning_rate": 9.353750588354527e-07, + "loss": 0.13807624578475952, + "step": 5035 + }, + { + "epoch": 0.9170305676855895, + "grad_norm": 0.1695316582918167, + "learning_rate": 9.155330850837834e-07, + "loss": 0.14289476871490478, + "step": 5040 + }, + { + "epoch": 0.9179403202328966, + "grad_norm": 0.1738404780626297, + "learning_rate": 8.958999003401191e-07, + "loss": 0.14070619344711305, + "step": 5045 + }, + { + "epoch": 0.9188500727802038, + "grad_norm": 0.20618964731693268, + "learning_rate": 8.764756748051662e-07, + "loss": 0.14535053968429565, + "step": 5050 + }, + { + "epoch": 0.9197598253275109, + "grad_norm": 0.1506137251853943, + "learning_rate": 8.572605768681546e-07, + "loss": 0.13995139598846434, + "step": 5055 + }, + { + "epoch": 0.9206695778748181, + "grad_norm": 0.17772039771080017, + "learning_rate": 8.382547731053708e-07, + "loss": 0.14470311403274536, + "step": 5060 + }, + { + "epoch": 0.9215793304221251, + "grad_norm": 0.19897456467151642, + "learning_rate": 8.194584282787382e-07, + "loss": 0.144488525390625, + "step": 5065 + }, + { + "epoch": 0.9224890829694323, + "grad_norm": 0.15899236500263214, + "learning_rate": 8.008717053343606e-07, + "loss": 0.1352991580963135, + "step": 5070 + }, + { + "epoch": 0.9233988355167394, + "grad_norm": 0.14965768158435822, + "learning_rate": 7.824947654011345e-07, + "loss": 0.13827911615371705, + "step": 5075 + }, + { + "epoch": 0.9243085880640466, + "grad_norm": 0.43651485443115234, + "learning_rate": 7.643277677893329e-07, + "loss": 0.14149526357650757, + "step": 5080 + }, + { + "epoch": 0.9252183406113537, + "grad_norm": 0.19912713766098022, + "learning_rate": 7.463708699892325e-07, + "loss": 0.14357032775878906, + "step": 5085 + }, + { + "epoch": 0.9261280931586608, + "grad_norm": 0.1635904610157013, + "learning_rate": 7.286242276697524e-07, + "loss": 0.13550699949264527, + "step": 5090 + }, + { + "epoch": 0.9270378457059679, + "grad_norm": 0.19391080737113953, + "learning_rate": 7.11087994677101e-07, + "loss": 0.14674756526947022, + "step": 5095 + }, + { + "epoch": 0.9279475982532751, + "grad_norm": 0.17458125948905945, + "learning_rate": 6.937623230334284e-07, + "loss": 0.14155579805374147, + "step": 5100 + }, + { + "epoch": 0.9288573508005823, + "grad_norm": 0.1617971807718277, + "learning_rate": 6.766473629355452e-07, + "loss": 0.140555477142334, + "step": 5105 + }, + { + "epoch": 0.9297671033478894, + "grad_norm": 0.16945427656173706, + "learning_rate": 6.59743262753576e-07, + "loss": 0.13607511520385743, + "step": 5110 + }, + { + "epoch": 0.9306768558951966, + "grad_norm": 0.18347840011119843, + "learning_rate": 6.43050169029702e-07, + "loss": 0.14903461933135986, + "step": 5115 + }, + { + "epoch": 0.9315866084425036, + "grad_norm": 0.15434837341308594, + "learning_rate": 6.265682264768869e-07, + "loss": 0.14146015644073487, + "step": 5120 + }, + { + "epoch": 0.9324963609898108, + "grad_norm": 0.1397712528705597, + "learning_rate": 6.10297577977606e-07, + "loss": 0.14261592626571656, + "step": 5125 + }, + { + "epoch": 0.9334061135371179, + "grad_norm": 0.1765873283147812, + "learning_rate": 5.942383645826361e-07, + "loss": 0.13559447526931762, + "step": 5130 + }, + { + "epoch": 0.9343158660844251, + "grad_norm": 0.1656057983636856, + "learning_rate": 5.783907255098003e-07, + "loss": 0.13961490392684936, + "step": 5135 + }, + { + "epoch": 0.9352256186317321, + "grad_norm": 0.2169366180896759, + "learning_rate": 5.627547981427894e-07, + "loss": 0.1447835922241211, + "step": 5140 + }, + { + "epoch": 0.9361353711790393, + "grad_norm": 0.18623125553131104, + "learning_rate": 5.473307180299508e-07, + "loss": 0.14366730451583862, + "step": 5145 + }, + { + "epoch": 0.9370451237263464, + "grad_norm": 0.15423963963985443, + "learning_rate": 5.32118618883129e-07, + "loss": 0.14295632839202882, + "step": 5150 + }, + { + "epoch": 0.9379548762736536, + "grad_norm": 0.18423247337341309, + "learning_rate": 5.17118632576491e-07, + "loss": 0.14137414693832398, + "step": 5155 + }, + { + "epoch": 0.9388646288209607, + "grad_norm": 0.15338757634162903, + "learning_rate": 5.023308891453915e-07, + "loss": 0.13583066463470458, + "step": 5160 + }, + { + "epoch": 0.9397743813682679, + "grad_norm": 0.2293633222579956, + "learning_rate": 4.877555167852515e-07, + "loss": 0.14819620847702025, + "step": 5165 + }, + { + "epoch": 0.9406841339155749, + "grad_norm": 0.16889944672584534, + "learning_rate": 4.7339264185043974e-07, + "loss": 0.13617686033248902, + "step": 5170 + }, + { + "epoch": 0.9415938864628821, + "grad_norm": 0.1767464578151703, + "learning_rate": 4.5924238885316775e-07, + "loss": 0.13487552404403685, + "step": 5175 + }, + { + "epoch": 0.9425036390101892, + "grad_norm": 0.16697899997234344, + "learning_rate": 4.453048804624327e-07, + "loss": 0.1446886420249939, + "step": 5180 + }, + { + "epoch": 0.9434133915574964, + "grad_norm": 0.19576266407966614, + "learning_rate": 4.315802375029293e-07, + "loss": 0.14252450466156005, + "step": 5185 + }, + { + "epoch": 0.9443231441048034, + "grad_norm": 0.14838077127933502, + "learning_rate": 4.18068578954034e-07, + "loss": 0.13933032751083374, + "step": 5190 + }, + { + "epoch": 0.9452328966521106, + "grad_norm": 0.18481744825839996, + "learning_rate": 4.047700219487388e-07, + "loss": 0.1410665273666382, + "step": 5195 + }, + { + "epoch": 0.9461426491994177, + "grad_norm": 0.16954176127910614, + "learning_rate": 3.9168468177265547e-07, + "loss": 0.1421758770942688, + "step": 5200 + }, + { + "epoch": 0.9470524017467249, + "grad_norm": 0.17614421248435974, + "learning_rate": 3.7881267186301306e-07, + "loss": 0.14059911966323851, + "step": 5205 + }, + { + "epoch": 0.9479621542940321, + "grad_norm": 0.1637226939201355, + "learning_rate": 3.6615410380767544e-07, + "loss": 0.1360395908355713, + "step": 5210 + }, + { + "epoch": 0.9488719068413392, + "grad_norm": 0.18330250680446625, + "learning_rate": 3.5370908734417006e-07, + "loss": 0.14543824195861815, + "step": 5215 + }, + { + "epoch": 0.9497816593886463, + "grad_norm": 0.1895420402288437, + "learning_rate": 3.414777303587413e-07, + "loss": 0.15304578542709352, + "step": 5220 + }, + { + "epoch": 0.9506914119359534, + "grad_norm": 0.15384933352470398, + "learning_rate": 3.294601388854041e-07, + "loss": 0.14675912857055665, + "step": 5225 + }, + { + "epoch": 0.9516011644832606, + "grad_norm": 0.20188499987125397, + "learning_rate": 3.1765641710505e-07, + "loss": 0.14068362712860108, + "step": 5230 + }, + { + "epoch": 0.9525109170305677, + "grad_norm": 0.16467279195785522, + "learning_rate": 3.060666673445123e-07, + "loss": 0.14733167886734008, + "step": 5235 + }, + { + "epoch": 0.9534206695778749, + "grad_norm": 0.16632016003131866, + "learning_rate": 2.9469099007569943e-07, + "loss": 0.13753929138183593, + "step": 5240 + }, + { + "epoch": 0.9543304221251819, + "grad_norm": 0.1477566957473755, + "learning_rate": 2.83529483914724e-07, + "loss": 0.14354891777038575, + "step": 5245 + }, + { + "epoch": 0.9552401746724891, + "grad_norm": 0.1693645417690277, + "learning_rate": 2.7258224562102805e-07, + "loss": 0.14622807502746582, + "step": 5250 + }, + { + "epoch": 0.9561499272197962, + "grad_norm": 0.17574062943458557, + "learning_rate": 2.6184937009657295e-07, + "loss": 0.1344899296760559, + "step": 5255 + }, + { + "epoch": 0.9570596797671034, + "grad_norm": 0.17448563873767853, + "learning_rate": 2.513309503850009e-07, + "loss": 0.1355789542198181, + "step": 5260 + }, + { + "epoch": 0.9579694323144105, + "grad_norm": 0.16993778944015503, + "learning_rate": 2.41027077670819e-07, + "loss": 0.151595401763916, + "step": 5265 + }, + { + "epoch": 0.9588791848617176, + "grad_norm": 0.16944102942943573, + "learning_rate": 2.3093784127863062e-07, + "loss": 0.1466623306274414, + "step": 5270 + }, + { + "epoch": 0.9597889374090247, + "grad_norm": 0.18085163831710815, + "learning_rate": 2.2106332867234402e-07, + "loss": 0.14645814895629883, + "step": 5275 + }, + { + "epoch": 0.9606986899563319, + "grad_norm": 0.14682307839393616, + "learning_rate": 2.1140362545442605e-07, + "loss": 0.13901774883270263, + "step": 5280 + }, + { + "epoch": 0.961608442503639, + "grad_norm": 0.17189526557922363, + "learning_rate": 2.0195881536514694e-07, + "loss": 0.14153491258621215, + "step": 5285 + }, + { + "epoch": 0.9625181950509462, + "grad_norm": 0.1977207362651825, + "learning_rate": 1.9272898028186714e-07, + "loss": 0.1437437653541565, + "step": 5290 + }, + { + "epoch": 0.9634279475982532, + "grad_norm": 0.16637668013572693, + "learning_rate": 1.837142002183184e-07, + "loss": 0.13910138607025146, + "step": 5295 + }, + { + "epoch": 0.9643377001455604, + "grad_norm": 0.18155774474143982, + "learning_rate": 1.7491455332391548e-07, + "loss": 0.14177814722061158, + "step": 5300 + }, + { + "epoch": 0.9652474526928675, + "grad_norm": 0.32478174567222595, + "learning_rate": 1.6633011588307878e-07, + "loss": 0.14292703866958617, + "step": 5305 + }, + { + "epoch": 0.9661572052401747, + "grad_norm": 0.18050940334796906, + "learning_rate": 1.5796096231456558e-07, + "loss": 0.13252723217010498, + "step": 5310 + }, + { + "epoch": 0.9670669577874818, + "grad_norm": 0.15919657051563263, + "learning_rate": 1.4980716517083715e-07, + "loss": 0.14491976499557496, + "step": 5315 + }, + { + "epoch": 0.9679767103347889, + "grad_norm": 0.15895310044288635, + "learning_rate": 1.4186879513741758e-07, + "loss": 0.13617006540298462, + "step": 5320 + }, + { + "epoch": 0.9688864628820961, + "grad_norm": 0.1543736606836319, + "learning_rate": 1.3414592103228595e-07, + "loss": 0.14220429658889772, + "step": 5325 + }, + { + "epoch": 0.9697962154294032, + "grad_norm": 0.16660647094249725, + "learning_rate": 1.2663860980528797e-07, + "loss": 0.14069980382919312, + "step": 5330 + }, + { + "epoch": 0.9707059679767104, + "grad_norm": 0.15238550305366516, + "learning_rate": 1.1934692653754186e-07, + "loss": 0.13978019952774048, + "step": 5335 + }, + { + "epoch": 0.9716157205240175, + "grad_norm": 0.1649473011493683, + "learning_rate": 1.1227093444088066e-07, + "loss": 0.1401435136795044, + "step": 5340 + }, + { + "epoch": 0.9725254730713246, + "grad_norm": 0.14920124411582947, + "learning_rate": 1.0541069485730249e-07, + "loss": 0.13952178955078126, + "step": 5345 + }, + { + "epoch": 0.9734352256186317, + "grad_norm": 0.16802479326725006, + "learning_rate": 9.876626725844329e-08, + "loss": 0.14808181524276734, + "step": 5350 + }, + { + "epoch": 0.9743449781659389, + "grad_norm": 0.18096603453159332, + "learning_rate": 9.233770924505781e-08, + "loss": 0.13938647508621216, + "step": 5355 + }, + { + "epoch": 0.975254730713246, + "grad_norm": 0.1658579558134079, + "learning_rate": 8.612507654651991e-08, + "loss": 0.14219754934310913, + "step": 5360 + }, + { + "epoch": 0.9761644832605532, + "grad_norm": 0.1547713279724121, + "learning_rate": 8.012842302033696e-08, + "loss": 0.14298388957977295, + "step": 5365 + }, + { + "epoch": 0.9770742358078602, + "grad_norm": 0.18247587978839874, + "learning_rate": 7.434780065169178e-08, + "loss": 0.14103788137435913, + "step": 5370 + }, + { + "epoch": 0.9779839883551674, + "grad_norm": 0.17593605816364288, + "learning_rate": 6.878325955297915e-08, + "loss": 0.1450013041496277, + "step": 5375 + }, + { + "epoch": 0.9788937409024745, + "grad_norm": 0.17178039252758026, + "learning_rate": 6.343484796338395e-08, + "loss": 0.14021269083023072, + "step": 5380 + }, + { + "epoch": 0.9798034934497817, + "grad_norm": 0.17904147505760193, + "learning_rate": 5.830261224845923e-08, + "loss": 0.1460060477256775, + "step": 5385 + }, + { + "epoch": 0.9807132459970888, + "grad_norm": 0.16323266923427582, + "learning_rate": 5.338659689971548e-08, + "loss": 0.13915741443634033, + "step": 5390 + }, + { + "epoch": 0.9816229985443959, + "grad_norm": 0.1829039305448532, + "learning_rate": 4.8686844534248655e-08, + "loss": 0.1372266888618469, + "step": 5395 + }, + { + "epoch": 0.982532751091703, + "grad_norm": 0.16742415726184845, + "learning_rate": 4.420339589435995e-08, + "loss": 0.14404670000076295, + "step": 5400 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.967782995976911e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5400/training_args.bin b/checkpoint-5400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-5400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-5500/README.md b/checkpoint-5500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-5500/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-5500/adapter_config.json b/checkpoint-5500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-5500/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-5500/adapter_model.safetensors b/checkpoint-5500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..13e7eccf51f5d9d11e1fc349773e81db85eac36b --- /dev/null +++ b/checkpoint-5500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:023fcb9c596c99c5e8d74320f9720621834918ec3bcd5d877b44b0fe0907ce2e +size 169741912 diff --git a/checkpoint-5500/chat_template.jinja b/checkpoint-5500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-5500/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-5500/optimizer.pt b/checkpoint-5500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3c813f7ee054bd6cc17032b68e0ee8e23f652ff --- /dev/null +++ b/checkpoint-5500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a943077e29417c2f64c4e35a6044a31f0885d613fe3e295a70b06474feaca5da +size 72807355 diff --git a/checkpoint-5500/processor_config.json b/checkpoint-5500/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-5500/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-5500/rng_state.pth b/checkpoint-5500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bd3c190a62ba12f21428f29e0f4bde711034a75b --- /dev/null +++ b/checkpoint-5500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4a9f217e852f439efa6bd32fde98d6867f11aa6ea13ddc021ba10af6a0b0934 +size 14645 diff --git a/checkpoint-5500/scheduler.pt b/checkpoint-5500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..110cd65327bf1c41e27ad865b4a927c404da750c --- /dev/null +++ b/checkpoint-5500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbfcd72a0f469e954d7ef4acd4596e1e654cea8ecfeeb0bc5b3be32d628eac2c +size 1465 diff --git a/checkpoint-5500/tokenizer.json b/checkpoint-5500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-5500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-5500/tokenizer_config.json b/checkpoint-5500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-5500/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-5500/trainer_state.json b/checkpoint-5500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..14b1ef0421dcf74a09f0cd31dddcd67f0043815f --- /dev/null +++ b/checkpoint-5500/trainer_state.json @@ -0,0 +1,7742 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0007278020378456, + "eval_steps": 100, + "global_step": 5500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + }, + { + "epoch": 0.16466521106259097, + "grad_norm": 0.16048531234264374, + "learning_rate": 4.7669949816846173e-05, + "loss": 0.18330031633377075, + "step": 905 + }, + { + "epoch": 0.1655749636098981, + "grad_norm": 0.1440177708864212, + "learning_rate": 4.7638820979495534e-05, + "loss": 0.17712442874908446, + "step": 910 + }, + { + "epoch": 0.16648471615720525, + "grad_norm": 0.19635969400405884, + "learning_rate": 4.760749588548738e-05, + "loss": 0.18679027557373046, + "step": 915 + }, + { + "epoch": 0.16739446870451238, + "grad_norm": 0.15576541423797607, + "learning_rate": 4.757597480637995e-05, + "loss": 0.19283764362335204, + "step": 920 + }, + { + "epoch": 0.1683042212518195, + "grad_norm": 0.1550331562757492, + "learning_rate": 4.7544258015430463e-05, + "loss": 0.18269542455673218, + "step": 925 + }, + { + "epoch": 0.16921397379912664, + "grad_norm": 0.18369626998901367, + "learning_rate": 4.75123457875928e-05, + "loss": 0.1697891116142273, + "step": 930 + }, + { + "epoch": 0.17012372634643377, + "grad_norm": 0.15266314148902893, + "learning_rate": 4.7480238399515074e-05, + "loss": 0.18523451089859008, + "step": 935 + }, + { + "epoch": 0.1710334788937409, + "grad_norm": 0.16709664463996887, + "learning_rate": 4.744793612953724e-05, + "loss": 0.1803238034248352, + "step": 940 + }, + { + "epoch": 0.17194323144104803, + "grad_norm": 0.14929179847240448, + "learning_rate": 4.741543925768872e-05, + "loss": 0.1861217737197876, + "step": 945 + }, + { + "epoch": 0.17285298398835516, + "grad_norm": 0.1362280696630478, + "learning_rate": 4.7382748065685915e-05, + "loss": 0.17896100282669067, + "step": 950 + }, + { + "epoch": 0.1737627365356623, + "grad_norm": 0.15290239453315735, + "learning_rate": 4.734986283692982e-05, + "loss": 0.18432788848876952, + "step": 955 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.1287035197019577, + "learning_rate": 4.73167838565035e-05, + "loss": 0.18485682010650634, + "step": 960 + }, + { + "epoch": 0.17558224163027655, + "grad_norm": 0.17969627678394318, + "learning_rate": 4.728351141116971e-05, + "loss": 0.17361557483673096, + "step": 965 + }, + { + "epoch": 0.1764919941775837, + "grad_norm": 0.13751201331615448, + "learning_rate": 4.7250045789368326e-05, + "loss": 0.1731679320335388, + "step": 970 + }, + { + "epoch": 0.17740174672489084, + "grad_norm": 0.1603265255689621, + "learning_rate": 4.721638728121388e-05, + "loss": 0.17308170795440675, + "step": 975 + }, + { + "epoch": 0.17831149927219797, + "grad_norm": 0.1592789888381958, + "learning_rate": 4.718253617849306e-05, + "loss": 0.17534757852554322, + "step": 980 + }, + { + "epoch": 0.1792212518195051, + "grad_norm": 0.12727224826812744, + "learning_rate": 4.714849277466214e-05, + "loss": 0.17817609310150145, + "step": 985 + }, + { + "epoch": 0.18013100436681223, + "grad_norm": 0.15401554107666016, + "learning_rate": 4.711425736484447e-05, + "loss": 0.1733405351638794, + "step": 990 + }, + { + "epoch": 0.18104075691411936, + "grad_norm": 0.13253968954086304, + "learning_rate": 4.7079830245827906e-05, + "loss": 0.17846795320510864, + "step": 995 + }, + { + "epoch": 0.1819505094614265, + "grad_norm": 0.21846213936805725, + "learning_rate": 4.7045211716062245e-05, + "loss": 0.18021599054336548, + "step": 1000 + }, + { + "epoch": 0.18286026200873362, + "grad_norm": 0.16867990791797638, + "learning_rate": 4.7010402075656595e-05, + "loss": 0.18232386112213134, + "step": 1005 + }, + { + "epoch": 0.18377001455604075, + "grad_norm": 0.17180582880973816, + "learning_rate": 4.697540162637686e-05, + "loss": 0.1816317319869995, + "step": 1010 + }, + { + "epoch": 0.18467976710334788, + "grad_norm": 0.16480213403701782, + "learning_rate": 4.694021067164303e-05, + "loss": 0.17718446254730225, + "step": 1015 + }, + { + "epoch": 0.185589519650655, + "grad_norm": 0.15015918016433716, + "learning_rate": 4.6904829516526605e-05, + "loss": 0.17412011623382567, + "step": 1020 + }, + { + "epoch": 0.18649927219796217, + "grad_norm": 0.14445139467716217, + "learning_rate": 4.686925846774795e-05, + "loss": 0.1778018832206726, + "step": 1025 + }, + { + "epoch": 0.1874090247452693, + "grad_norm": 0.1701960265636444, + "learning_rate": 4.683349783367362e-05, + "loss": 0.16901081800460815, + "step": 1030 + }, + { + "epoch": 0.18831877729257643, + "grad_norm": 0.15894867479801178, + "learning_rate": 4.679754792431368e-05, + "loss": 0.17055928707122803, + "step": 1035 + }, + { + "epoch": 0.18922852983988356, + "grad_norm": 0.1511942446231842, + "learning_rate": 4.676140905131903e-05, + "loss": 0.17339680194854737, + "step": 1040 + }, + { + "epoch": 0.1901382823871907, + "grad_norm": 0.14735209941864014, + "learning_rate": 4.672508152797872e-05, + "loss": 0.17802717685699462, + "step": 1045 + }, + { + "epoch": 0.19104803493449782, + "grad_norm": 0.17367291450500488, + "learning_rate": 4.66885656692172e-05, + "loss": 0.1732744097709656, + "step": 1050 + }, + { + "epoch": 0.19195778748180495, + "grad_norm": 0.147227481007576, + "learning_rate": 4.665186179159159e-05, + "loss": 0.17040517330169677, + "step": 1055 + }, + { + "epoch": 0.19286754002911208, + "grad_norm": 0.1709655076265335, + "learning_rate": 4.6614970213289e-05, + "loss": 0.17794088125228882, + "step": 1060 + }, + { + "epoch": 0.1937772925764192, + "grad_norm": 0.1588088721036911, + "learning_rate": 4.657789125412366e-05, + "loss": 0.17180380821228028, + "step": 1065 + }, + { + "epoch": 0.19468704512372634, + "grad_norm": 0.14827021956443787, + "learning_rate": 4.654062523553428e-05, + "loss": 0.182997989654541, + "step": 1070 + }, + { + "epoch": 0.19559679767103347, + "grad_norm": 0.16230466961860657, + "learning_rate": 4.6503172480581126e-05, + "loss": 0.17346880435943604, + "step": 1075 + }, + { + "epoch": 0.1965065502183406, + "grad_norm": 0.1637624353170395, + "learning_rate": 4.646553331394333e-05, + "loss": 0.17263576984405518, + "step": 1080 + }, + { + "epoch": 0.19741630276564776, + "grad_norm": 0.15977843105793, + "learning_rate": 4.642770806191603e-05, + "loss": 0.17284308671951293, + "step": 1085 + }, + { + "epoch": 0.19832605531295489, + "grad_norm": 0.15394869446754456, + "learning_rate": 4.6389697052407534e-05, + "loss": 0.17797101736068727, + "step": 1090 + }, + { + "epoch": 0.19923580786026202, + "grad_norm": 0.15995225310325623, + "learning_rate": 4.6351500614936485e-05, + "loss": 0.18137198686599731, + "step": 1095 + }, + { + "epoch": 0.20014556040756915, + "grad_norm": 0.1779479682445526, + "learning_rate": 4.6313119080629006e-05, + "loss": 0.17998344898223878, + "step": 1100 + }, + { + "epoch": 0.20105531295487628, + "grad_norm": 0.14362832903862, + "learning_rate": 4.627455278221584e-05, + "loss": 0.18196423053741456, + "step": 1105 + }, + { + "epoch": 0.2019650655021834, + "grad_norm": 0.15951639413833618, + "learning_rate": 4.623580205402947e-05, + "loss": 0.17423888444900512, + "step": 1110 + }, + { + "epoch": 0.20287481804949054, + "grad_norm": 0.17273563146591187, + "learning_rate": 4.619686723200115e-05, + "loss": 0.17392473220825194, + "step": 1115 + }, + { + "epoch": 0.20378457059679767, + "grad_norm": 0.1655360758304596, + "learning_rate": 4.615774865365813e-05, + "loss": 0.17528389692306517, + "step": 1120 + }, + { + "epoch": 0.2046943231441048, + "grad_norm": 0.15920691192150116, + "learning_rate": 4.611844665812058e-05, + "loss": 0.1806849241256714, + "step": 1125 + }, + { + "epoch": 0.20560407569141192, + "grad_norm": 0.16114577651023865, + "learning_rate": 4.607896158609875e-05, + "loss": 0.17217352390289306, + "step": 1130 + }, + { + "epoch": 0.20651382823871905, + "grad_norm": 0.1499422937631607, + "learning_rate": 4.603929377988999e-05, + "loss": 0.17806737422943114, + "step": 1135 + }, + { + "epoch": 0.2074235807860262, + "grad_norm": 0.17605191469192505, + "learning_rate": 4.5999443583375765e-05, + "loss": 0.17842113971710205, + "step": 1140 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.16117210686206818, + "learning_rate": 4.595941134201871e-05, + "loss": 0.18379683494567872, + "step": 1145 + }, + { + "epoch": 0.20924308588064047, + "grad_norm": 0.21199050545692444, + "learning_rate": 4.591919740285957e-05, + "loss": 0.16286123991012574, + "step": 1150 + }, + { + "epoch": 0.2101528384279476, + "grad_norm": 0.15100529789924622, + "learning_rate": 4.587880211451427e-05, + "loss": 0.17995200157165528, + "step": 1155 + }, + { + "epoch": 0.21106259097525473, + "grad_norm": 0.16618172824382782, + "learning_rate": 4.583822582717085e-05, + "loss": 0.16960303783416747, + "step": 1160 + }, + { + "epoch": 0.21197234352256186, + "grad_norm": 0.14743569493293762, + "learning_rate": 4.579746889258643e-05, + "loss": 0.17762668132781984, + "step": 1165 + }, + { + "epoch": 0.212882096069869, + "grad_norm": 0.1697179079055786, + "learning_rate": 4.575653166408417e-05, + "loss": 0.16656005382537842, + "step": 1170 + }, + { + "epoch": 0.21379184861717612, + "grad_norm": 0.14886513352394104, + "learning_rate": 4.57154144965502e-05, + "loss": 0.17091882228851318, + "step": 1175 + }, + { + "epoch": 0.21470160116448325, + "grad_norm": 0.18197473883628845, + "learning_rate": 4.5674117746430556e-05, + "loss": 0.1770920753479004, + "step": 1180 + }, + { + "epoch": 0.21561135371179038, + "grad_norm": 0.17323088645935059, + "learning_rate": 4.563264177172807e-05, + "loss": 0.1734643578529358, + "step": 1185 + }, + { + "epoch": 0.2165211062590975, + "grad_norm": 0.1521984338760376, + "learning_rate": 4.559098693199929e-05, + "loss": 0.17515116930007935, + "step": 1190 + }, + { + "epoch": 0.21743085880640467, + "grad_norm": 0.1842304915189743, + "learning_rate": 4.554915358835134e-05, + "loss": 0.16798022985458375, + "step": 1195 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.14753451943397522, + "learning_rate": 4.5507142103438794e-05, + "loss": 0.1755476713180542, + "step": 1200 + }, + { + "epoch": 0.21925036390101893, + "grad_norm": 0.17096194624900818, + "learning_rate": 4.546495284146057e-05, + "loss": 0.1792473554611206, + "step": 1205 + }, + { + "epoch": 0.22016011644832606, + "grad_norm": 0.1579233556985855, + "learning_rate": 4.542258616815669e-05, + "loss": 0.17230144739151002, + "step": 1210 + }, + { + "epoch": 0.2210698689956332, + "grad_norm": 0.177297905087471, + "learning_rate": 4.5380042450805216e-05, + "loss": 0.1807127833366394, + "step": 1215 + }, + { + "epoch": 0.22197962154294032, + "grad_norm": 0.14331696927547455, + "learning_rate": 4.533732205821897e-05, + "loss": 0.17201389074325563, + "step": 1220 + }, + { + "epoch": 0.22288937409024745, + "grad_norm": 0.14473360776901245, + "learning_rate": 4.529442536074239e-05, + "loss": 0.17036900520324708, + "step": 1225 + }, + { + "epoch": 0.22379912663755458, + "grad_norm": 0.1820901483297348, + "learning_rate": 4.5251352730248314e-05, + "loss": 0.17704882621765136, + "step": 1230 + }, + { + "epoch": 0.2247088791848617, + "grad_norm": 0.1948976367712021, + "learning_rate": 4.5208104540134746e-05, + "loss": 0.1706973433494568, + "step": 1235 + }, + { + "epoch": 0.22561863173216884, + "grad_norm": 0.16660070419311523, + "learning_rate": 4.51646811653216e-05, + "loss": 0.17636821269989014, + "step": 1240 + }, + { + "epoch": 0.22652838427947597, + "grad_norm": 0.1699984073638916, + "learning_rate": 4.512108298224751e-05, + "loss": 0.16986632347106934, + "step": 1245 + }, + { + "epoch": 0.22743813682678313, + "grad_norm": 0.17601042985916138, + "learning_rate": 4.50773103688665e-05, + "loss": 0.17507898807525635, + "step": 1250 + }, + { + "epoch": 0.22834788937409026, + "grad_norm": 0.17557238042354584, + "learning_rate": 4.503336370464476e-05, + "loss": 0.17702863216400147, + "step": 1255 + }, + { + "epoch": 0.2292576419213974, + "grad_norm": 0.1800651252269745, + "learning_rate": 4.498924337055729e-05, + "loss": 0.16419180631637573, + "step": 1260 + }, + { + "epoch": 0.23016739446870452, + "grad_norm": 0.2022479772567749, + "learning_rate": 4.494494974908468e-05, + "loss": 0.17482060194015503, + "step": 1265 + }, + { + "epoch": 0.23107714701601165, + "grad_norm": 0.14180205762386322, + "learning_rate": 4.490048322420973e-05, + "loss": 0.1723136067390442, + "step": 1270 + }, + { + "epoch": 0.23198689956331878, + "grad_norm": 0.18607310950756073, + "learning_rate": 4.485584418141419e-05, + "loss": 0.17096419334411622, + "step": 1275 + }, + { + "epoch": 0.2328966521106259, + "grad_norm": 0.15958310663700104, + "learning_rate": 4.481103300767529e-05, + "loss": 0.1656244158744812, + "step": 1280 + }, + { + "epoch": 0.23380640465793304, + "grad_norm": 0.17552383244037628, + "learning_rate": 4.476605009146255e-05, + "loss": 0.17677626609802247, + "step": 1285 + }, + { + "epoch": 0.23471615720524017, + "grad_norm": 0.15299823880195618, + "learning_rate": 4.472089582273429e-05, + "loss": 0.1778991103172302, + "step": 1290 + }, + { + "epoch": 0.2356259097525473, + "grad_norm": 0.14613987505435944, + "learning_rate": 4.46755705929343e-05, + "loss": 0.17071452140808105, + "step": 1295 + }, + { + "epoch": 0.23653566229985443, + "grad_norm": 0.17781122028827667, + "learning_rate": 4.463007479498843e-05, + "loss": 0.16955430507659913, + "step": 1300 + }, + { + "epoch": 0.23744541484716158, + "grad_norm": 0.16326487064361572, + "learning_rate": 4.458440882330119e-05, + "loss": 0.1777693510055542, + "step": 1305 + }, + { + "epoch": 0.23835516739446871, + "grad_norm": 0.17701926827430725, + "learning_rate": 4.4538573073752365e-05, + "loss": 0.16323351860046387, + "step": 1310 + }, + { + "epoch": 0.23926491994177584, + "grad_norm": 0.13104717433452606, + "learning_rate": 4.449256794369349e-05, + "loss": 0.17653456926345826, + "step": 1315 + }, + { + "epoch": 0.24017467248908297, + "grad_norm": 0.1796836256980896, + "learning_rate": 4.444639383194452e-05, + "loss": 0.17189600467681884, + "step": 1320 + }, + { + "epoch": 0.2410844250363901, + "grad_norm": 0.14919696748256683, + "learning_rate": 4.440005113879029e-05, + "loss": 0.17003334760665895, + "step": 1325 + }, + { + "epoch": 0.24199417758369723, + "grad_norm": 0.1728784441947937, + "learning_rate": 4.4353540265977064e-05, + "loss": 0.17397408485412597, + "step": 1330 + }, + { + "epoch": 0.24290393013100436, + "grad_norm": 0.14591015875339508, + "learning_rate": 4.43068616167091e-05, + "loss": 0.16498478651046752, + "step": 1335 + }, + { + "epoch": 0.2438136826783115, + "grad_norm": 0.18417201936244965, + "learning_rate": 4.4260015595645055e-05, + "loss": 0.16841750144958495, + "step": 1340 + }, + { + "epoch": 0.24472343522561862, + "grad_norm": 0.16264279186725616, + "learning_rate": 4.4213002608894605e-05, + "loss": 0.16907373666763306, + "step": 1345 + }, + { + "epoch": 0.24563318777292575, + "grad_norm": 0.15248481929302216, + "learning_rate": 4.416582306401481e-05, + "loss": 0.15931472778320313, + "step": 1350 + }, + { + "epoch": 0.24654294032023288, + "grad_norm": 0.1488373875617981, + "learning_rate": 4.4118477370006636e-05, + "loss": 0.1701716423034668, + "step": 1355 + }, + { + "epoch": 0.24745269286754004, + "grad_norm": 0.14679782092571259, + "learning_rate": 4.407096593731142e-05, + "loss": 0.157412326335907, + "step": 1360 + }, + { + "epoch": 0.24836244541484717, + "grad_norm": 0.17139530181884766, + "learning_rate": 4.402328917780728e-05, + "loss": 0.17303754091262818, + "step": 1365 + }, + { + "epoch": 0.2492721979621543, + "grad_norm": 0.1534871757030487, + "learning_rate": 4.397544750480554e-05, + "loss": 0.1786255121231079, + "step": 1370 + }, + { + "epoch": 0.2501819505094614, + "grad_norm": 0.1876252293586731, + "learning_rate": 4.39274413330472e-05, + "loss": 0.16442898511886597, + "step": 1375 + }, + { + "epoch": 0.25109170305676853, + "grad_norm": 0.16165752708911896, + "learning_rate": 4.387927107869928e-05, + "loss": 0.1780426025390625, + "step": 1380 + }, + { + "epoch": 0.25200145560407566, + "grad_norm": 0.17242255806922913, + "learning_rate": 4.383093715935124e-05, + "loss": 0.15959256887435913, + "step": 1385 + }, + { + "epoch": 0.25291120815138285, + "grad_norm": 0.1627114862203598, + "learning_rate": 4.378243999401137e-05, + "loss": 0.17606115341186523, + "step": 1390 + }, + { + "epoch": 0.25382096069869, + "grad_norm": 0.15911224484443665, + "learning_rate": 4.373378000310312e-05, + "loss": 0.16798585653305054, + "step": 1395 + }, + { + "epoch": 0.2547307132459971, + "grad_norm": 0.15542249381542206, + "learning_rate": 4.3684957608461505e-05, + "loss": 0.1695417881011963, + "step": 1400 + }, + { + "epoch": 0.25564046579330424, + "grad_norm": 0.1475304812192917, + "learning_rate": 4.363597323332941e-05, + "loss": 0.16340878009796142, + "step": 1405 + }, + { + "epoch": 0.25655021834061137, + "grad_norm": 0.16943927109241486, + "learning_rate": 4.358682730235395e-05, + "loss": 0.17240238189697266, + "step": 1410 + }, + { + "epoch": 0.2574599708879185, + "grad_norm": 0.1816391944885254, + "learning_rate": 4.3537520241582744e-05, + "loss": 0.16558437347412108, + "step": 1415 + }, + { + "epoch": 0.25836972343522563, + "grad_norm": 0.23851341009140015, + "learning_rate": 4.348805247846027e-05, + "loss": 0.16796000003814698, + "step": 1420 + }, + { + "epoch": 0.25927947598253276, + "grad_norm": 0.15415243804454803, + "learning_rate": 4.343842444182414e-05, + "loss": 0.1746017098426819, + "step": 1425 + }, + { + "epoch": 0.2601892285298399, + "grad_norm": 0.15651032328605652, + "learning_rate": 4.338863656190139e-05, + "loss": 0.1649057984352112, + "step": 1430 + }, + { + "epoch": 0.261098981077147, + "grad_norm": 0.16601966321468353, + "learning_rate": 4.333868927030471e-05, + "loss": 0.15888988971710205, + "step": 1435 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.1549467295408249, + "learning_rate": 4.328858300002876e-05, + "loss": 0.16357985734939576, + "step": 1440 + }, + { + "epoch": 0.2629184861717613, + "grad_norm": 0.16332370042800903, + "learning_rate": 4.32383181854464e-05, + "loss": 0.16749982833862304, + "step": 1445 + }, + { + "epoch": 0.2638282387190684, + "grad_norm": 0.14827077090740204, + "learning_rate": 4.3187895262304894e-05, + "loss": 0.16886214017868043, + "step": 1450 + }, + { + "epoch": 0.26473799126637554, + "grad_norm": 0.1557198166847229, + "learning_rate": 4.313731466772216e-05, + "loss": 0.17512214183807373, + "step": 1455 + }, + { + "epoch": 0.26564774381368267, + "grad_norm": 0.17263570427894592, + "learning_rate": 4.308657684018299e-05, + "loss": 0.16248074769973755, + "step": 1460 + }, + { + "epoch": 0.2665574963609898, + "grad_norm": 0.17135761678218842, + "learning_rate": 4.303568221953521e-05, + "loss": 0.16605921983718872, + "step": 1465 + }, + { + "epoch": 0.26746724890829693, + "grad_norm": 0.14322632551193237, + "learning_rate": 4.2984631246985897e-05, + "loss": 0.1610772728919983, + "step": 1470 + }, + { + "epoch": 0.26837700145560406, + "grad_norm": 0.18852312862873077, + "learning_rate": 4.2933424365097564e-05, + "loss": 0.1686462163925171, + "step": 1475 + }, + { + "epoch": 0.2692867540029112, + "grad_norm": 0.1780245155096054, + "learning_rate": 4.2882062017784294e-05, + "loss": 0.16953932046890258, + "step": 1480 + }, + { + "epoch": 0.2701965065502183, + "grad_norm": 0.180568665266037, + "learning_rate": 4.2830544650307895e-05, + "loss": 0.16442664861679077, + "step": 1485 + }, + { + "epoch": 0.27110625909752545, + "grad_norm": 0.16876435279846191, + "learning_rate": 4.277887270927407e-05, + "loss": 0.17128173112869263, + "step": 1490 + }, + { + "epoch": 0.2720160116448326, + "grad_norm": 0.164053276181221, + "learning_rate": 4.2727046642628513e-05, + "loss": 0.16331382989883422, + "step": 1495 + }, + { + "epoch": 0.27292576419213976, + "grad_norm": 0.14577528834342957, + "learning_rate": 4.267506689965305e-05, + "loss": 0.1638316035270691, + "step": 1500 + }, + { + "epoch": 0.2738355167394469, + "grad_norm": 0.1648740917444229, + "learning_rate": 4.262293393096171e-05, + "loss": 0.15332664251327516, + "step": 1505 + }, + { + "epoch": 0.274745269286754, + "grad_norm": 0.16445094347000122, + "learning_rate": 4.257064818849685e-05, + "loss": 0.1706634521484375, + "step": 1510 + }, + { + "epoch": 0.27565502183406115, + "grad_norm": 0.1584935486316681, + "learning_rate": 4.251821012552524e-05, + "loss": 0.1684114694595337, + "step": 1515 + }, + { + "epoch": 0.2765647743813683, + "grad_norm": 0.17215611040592194, + "learning_rate": 4.24656201966341e-05, + "loss": 0.15594131946563722, + "step": 1520 + }, + { + "epoch": 0.2774745269286754, + "grad_norm": 0.15945589542388916, + "learning_rate": 4.2412878857727214e-05, + "loss": 0.1686659574508667, + "step": 1525 + }, + { + "epoch": 0.27838427947598254, + "grad_norm": 0.16103951632976532, + "learning_rate": 4.2359986566020906e-05, + "loss": 0.17779340744018554, + "step": 1530 + }, + { + "epoch": 0.2792940320232897, + "grad_norm": 0.1770307570695877, + "learning_rate": 4.230694378004014e-05, + "loss": 0.16786882877349854, + "step": 1535 + }, + { + "epoch": 0.2802037845705968, + "grad_norm": 0.16225053369998932, + "learning_rate": 4.2253750959614504e-05, + "loss": 0.16558897495269775, + "step": 1540 + }, + { + "epoch": 0.28111353711790393, + "grad_norm": 0.27213969826698303, + "learning_rate": 4.220040856587425e-05, + "loss": 0.1641119599342346, + "step": 1545 + }, + { + "epoch": 0.28202328966521106, + "grad_norm": 0.1773071587085724, + "learning_rate": 4.2146917061246284e-05, + "loss": 0.16919140815734862, + "step": 1550 + }, + { + "epoch": 0.2829330422125182, + "grad_norm": 0.15519705414772034, + "learning_rate": 4.209327690945014e-05, + "loss": 0.15501506328582765, + "step": 1555 + }, + { + "epoch": 0.2838427947598253, + "grad_norm": 0.19921597838401794, + "learning_rate": 4.203948857549402e-05, + "loss": 0.1690821886062622, + "step": 1560 + }, + { + "epoch": 0.28475254730713245, + "grad_norm": 0.15417630970478058, + "learning_rate": 4.1985552525670696e-05, + "loss": 0.1675640344619751, + "step": 1565 + }, + { + "epoch": 0.2856622998544396, + "grad_norm": 0.1739572137594223, + "learning_rate": 4.193146922755348e-05, + "loss": 0.16738017797470092, + "step": 1570 + }, + { + "epoch": 0.2865720524017467, + "grad_norm": 0.1384361982345581, + "learning_rate": 4.187723914999221e-05, + "loss": 0.16802358627319336, + "step": 1575 + }, + { + "epoch": 0.28748180494905384, + "grad_norm": 0.1491454839706421, + "learning_rate": 4.182286276310915e-05, + "loss": 0.1619583249092102, + "step": 1580 + }, + { + "epoch": 0.288391557496361, + "grad_norm": 0.15831919014453888, + "learning_rate": 4.176834053829492e-05, + "loss": 0.1625199794769287, + "step": 1585 + }, + { + "epoch": 0.2893013100436681, + "grad_norm": 0.16265396773815155, + "learning_rate": 4.1713672948204416e-05, + "loss": 0.16718552112579346, + "step": 1590 + }, + { + "epoch": 0.29021106259097523, + "grad_norm": 0.15153461694717407, + "learning_rate": 4.1658860466752714e-05, + "loss": 0.15979087352752686, + "step": 1595 + }, + { + "epoch": 0.29112081513828236, + "grad_norm": 0.1620412915945053, + "learning_rate": 4.160390356911096e-05, + "loss": 0.16103557348251343, + "step": 1600 + }, + { + "epoch": 0.2920305676855895, + "grad_norm": 0.16673807799816132, + "learning_rate": 4.154880273170223e-05, + "loss": 0.16394708156585694, + "step": 1605 + }, + { + "epoch": 0.2929403202328967, + "grad_norm": 0.14834867417812347, + "learning_rate": 4.149355843219744e-05, + "loss": 0.15916435718536376, + "step": 1610 + }, + { + "epoch": 0.2938500727802038, + "grad_norm": 0.16977964341640472, + "learning_rate": 4.143817114951119e-05, + "loss": 0.16538127660751342, + "step": 1615 + }, + { + "epoch": 0.29475982532751094, + "grad_norm": 0.17986875772476196, + "learning_rate": 4.138264136379756e-05, + "loss": 0.15514618158340454, + "step": 1620 + }, + { + "epoch": 0.29566957787481807, + "grad_norm": 0.15794920921325684, + "learning_rate": 4.132696955644605e-05, + "loss": 0.15992183685302735, + "step": 1625 + }, + { + "epoch": 0.2965793304221252, + "grad_norm": 0.19955399632453918, + "learning_rate": 4.127115621007731e-05, + "loss": 0.16362056732177735, + "step": 1630 + }, + { + "epoch": 0.29748908296943233, + "grad_norm": 0.1352023035287857, + "learning_rate": 4.121520180853903e-05, + "loss": 0.15631601810455323, + "step": 1635 + }, + { + "epoch": 0.29839883551673946, + "grad_norm": 0.15340781211853027, + "learning_rate": 4.1159106836901674e-05, + "loss": 0.1571858048439026, + "step": 1640 + }, + { + "epoch": 0.2993085880640466, + "grad_norm": 0.15311770141124725, + "learning_rate": 4.110287178145433e-05, + "loss": 0.16082344055175782, + "step": 1645 + }, + { + "epoch": 0.3002183406113537, + "grad_norm": 0.17811627686023712, + "learning_rate": 4.10464971297005e-05, + "loss": 0.16117215156555176, + "step": 1650 + }, + { + "epoch": 0.30112809315866085, + "grad_norm": 0.21060039103031158, + "learning_rate": 4.0989983370353805e-05, + "loss": 0.15838587284088135, + "step": 1655 + }, + { + "epoch": 0.302037845705968, + "grad_norm": 0.155836820602417, + "learning_rate": 4.093333099333383e-05, + "loss": 0.16648870706558228, + "step": 1660 + }, + { + "epoch": 0.3029475982532751, + "grad_norm": 0.13711698353290558, + "learning_rate": 4.0876540489761826e-05, + "loss": 0.16899349689483642, + "step": 1665 + }, + { + "epoch": 0.30385735080058224, + "grad_norm": 0.15162716805934906, + "learning_rate": 4.0819612351956485e-05, + "loss": 0.16574090719223022, + "step": 1670 + }, + { + "epoch": 0.30476710334788937, + "grad_norm": 0.15016348659992218, + "learning_rate": 4.0762547073429615e-05, + "loss": 0.1689780354499817, + "step": 1675 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.15182986855506897, + "learning_rate": 4.070534514888194e-05, + "loss": 0.1593686819076538, + "step": 1680 + }, + { + "epoch": 0.3065866084425036, + "grad_norm": 0.15648750960826874, + "learning_rate": 4.0648007074198765e-05, + "loss": 0.16436235904693602, + "step": 1685 + }, + { + "epoch": 0.30749636098981076, + "grad_norm": 0.18339484930038452, + "learning_rate": 4.0590533346445665e-05, + "loss": 0.1678077220916748, + "step": 1690 + }, + { + "epoch": 0.3084061135371179, + "grad_norm": 0.16426527500152588, + "learning_rate": 4.053292446386422e-05, + "loss": 0.1689227342605591, + "step": 1695 + }, + { + "epoch": 0.309315866084425, + "grad_norm": 0.16129335761070251, + "learning_rate": 4.047518092586766e-05, + "loss": 0.16592445373535156, + "step": 1700 + }, + { + "epoch": 0.31022561863173215, + "grad_norm": 0.15512363612651825, + "learning_rate": 4.041730323303654e-05, + "loss": 0.16142364740371704, + "step": 1705 + }, + { + "epoch": 0.3111353711790393, + "grad_norm": 0.159842386841774, + "learning_rate": 4.0359291887114425e-05, + "loss": 0.1702875852584839, + "step": 1710 + }, + { + "epoch": 0.3120451237263464, + "grad_norm": 0.19558854401111603, + "learning_rate": 4.030114739100352e-05, + "loss": 0.15966148376464845, + "step": 1715 + }, + { + "epoch": 0.3129548762736536, + "grad_norm": 0.1577496975660324, + "learning_rate": 4.024287024876029e-05, + "loss": 0.1620358943939209, + "step": 1720 + }, + { + "epoch": 0.3138646288209607, + "grad_norm": 0.1629355251789093, + "learning_rate": 4.0184460965591144e-05, + "loss": 0.16511552333831786, + "step": 1725 + }, + { + "epoch": 0.31477438136826785, + "grad_norm": 0.17060767114162445, + "learning_rate": 4.0125920047848e-05, + "loss": 0.15672838687896729, + "step": 1730 + }, + { + "epoch": 0.315684133915575, + "grad_norm": 0.22447620332241058, + "learning_rate": 4.006724800302394e-05, + "loss": 0.15339784622192382, + "step": 1735 + }, + { + "epoch": 0.3165938864628821, + "grad_norm": 0.14572037756443024, + "learning_rate": 4.000844533974878e-05, + "loss": 0.16566959619522095, + "step": 1740 + }, + { + "epoch": 0.31750363901018924, + "grad_norm": 0.15915483236312866, + "learning_rate": 3.9949512567784684e-05, + "loss": 0.16153957843780517, + "step": 1745 + }, + { + "epoch": 0.3184133915574964, + "grad_norm": 0.1668540984392166, + "learning_rate": 3.9890450198021704e-05, + "loss": 0.1659809947013855, + "step": 1750 + }, + { + "epoch": 0.3193231441048035, + "grad_norm": 0.16612035036087036, + "learning_rate": 3.983125874247341e-05, + "loss": 0.16941241025924683, + "step": 1755 + }, + { + "epoch": 0.32023289665211063, + "grad_norm": 0.15163679420948029, + "learning_rate": 3.9771938714272407e-05, + "loss": 0.16053590774536133, + "step": 1760 + }, + { + "epoch": 0.32114264919941776, + "grad_norm": 0.1797824203968048, + "learning_rate": 3.97124906276659e-05, + "loss": 0.1667110800743103, + "step": 1765 + }, + { + "epoch": 0.3220524017467249, + "grad_norm": 0.15076608955860138, + "learning_rate": 3.9652914998011237e-05, + "loss": 0.1607860803604126, + "step": 1770 + }, + { + "epoch": 0.322962154294032, + "grad_norm": 0.16523587703704834, + "learning_rate": 3.959321234177144e-05, + "loss": 0.16515827178955078, + "step": 1775 + }, + { + "epoch": 0.32387190684133915, + "grad_norm": 0.22065149247646332, + "learning_rate": 3.9533383176510746e-05, + "loss": 0.1618957757949829, + "step": 1780 + }, + { + "epoch": 0.3247816593886463, + "grad_norm": 0.16426463425159454, + "learning_rate": 3.9473428020890066e-05, + "loss": 0.15763382911682128, + "step": 1785 + }, + { + "epoch": 0.3256914119359534, + "grad_norm": 0.16474904119968414, + "learning_rate": 3.941334739466257e-05, + "loss": 0.15135571956634522, + "step": 1790 + }, + { + "epoch": 0.32660116448326054, + "grad_norm": 0.16746412217617035, + "learning_rate": 3.935314181866909e-05, + "loss": 0.15925389528274536, + "step": 1795 + }, + { + "epoch": 0.32751091703056767, + "grad_norm": 0.17819371819496155, + "learning_rate": 3.929281181483369e-05, + "loss": 0.1598669171333313, + "step": 1800 + }, + { + "epoch": 0.3284206695778748, + "grad_norm": 0.1816040277481079, + "learning_rate": 3.923235790615907e-05, + "loss": 0.1652522087097168, + "step": 1805 + }, + { + "epoch": 0.32933042212518193, + "grad_norm": 0.14846695959568024, + "learning_rate": 3.917178061672211e-05, + "loss": 0.16665585041046144, + "step": 1810 + }, + { + "epoch": 0.33024017467248906, + "grad_norm": 0.1734926551580429, + "learning_rate": 3.911108047166924e-05, + "loss": 0.16069791316986085, + "step": 1815 + }, + { + "epoch": 0.3311499272197962, + "grad_norm": 0.16154922544956207, + "learning_rate": 3.905025799721194e-05, + "loss": 0.16114097833633423, + "step": 1820 + }, + { + "epoch": 0.3320596797671033, + "grad_norm": 0.1538771390914917, + "learning_rate": 3.898931372062217e-05, + "loss": 0.1602831244468689, + "step": 1825 + }, + { + "epoch": 0.3329694323144105, + "grad_norm": 0.14036566019058228, + "learning_rate": 3.892824817022781e-05, + "loss": 0.1502395749092102, + "step": 1830 + }, + { + "epoch": 0.33387918486171764, + "grad_norm": 0.19212059676647186, + "learning_rate": 3.886706187540804e-05, + "loss": 0.16265250444412233, + "step": 1835 + }, + { + "epoch": 0.33478893740902477, + "grad_norm": 0.17410333454608917, + "learning_rate": 3.880575536658881e-05, + "loss": 0.15689224004745483, + "step": 1840 + }, + { + "epoch": 0.3356986899563319, + "grad_norm": 0.15165294706821442, + "learning_rate": 3.874432917523817e-05, + "loss": 0.15033140182495117, + "step": 1845 + }, + { + "epoch": 0.336608442503639, + "grad_norm": 0.16166730225086212, + "learning_rate": 3.8682783833861736e-05, + "loss": 0.16896235942840576, + "step": 1850 + }, + { + "epoch": 0.33751819505094616, + "grad_norm": 0.16497021913528442, + "learning_rate": 3.8621119875998026e-05, + "loss": 0.1600774645805359, + "step": 1855 + }, + { + "epoch": 0.3384279475982533, + "grad_norm": 0.17264948785305023, + "learning_rate": 3.855933783621384e-05, + "loss": 0.16947593688964843, + "step": 1860 + }, + { + "epoch": 0.3393377001455604, + "grad_norm": 0.16870704293251038, + "learning_rate": 3.8497438250099636e-05, + "loss": 0.16062095165252685, + "step": 1865 + }, + { + "epoch": 0.34024745269286755, + "grad_norm": 0.16644036769866943, + "learning_rate": 3.843542165426492e-05, + "loss": 0.16015599966049193, + "step": 1870 + }, + { + "epoch": 0.3411572052401747, + "grad_norm": 0.1626352220773697, + "learning_rate": 3.837328858633349e-05, + "loss": 0.17444703578948975, + "step": 1875 + }, + { + "epoch": 0.3420669577874818, + "grad_norm": 0.1427375227212906, + "learning_rate": 3.83110395849389e-05, + "loss": 0.1589805006980896, + "step": 1880 + }, + { + "epoch": 0.34297671033478894, + "grad_norm": 0.17840255796909332, + "learning_rate": 3.824867518971973e-05, + "loss": 0.15953952074050903, + "step": 1885 + }, + { + "epoch": 0.34388646288209607, + "grad_norm": 0.16998249292373657, + "learning_rate": 3.818619594131489e-05, + "loss": 0.16027032136917113, + "step": 1890 + }, + { + "epoch": 0.3447962154294032, + "grad_norm": 0.14950257539749146, + "learning_rate": 3.812360238135897e-05, + "loss": 0.15335670709609986, + "step": 1895 + }, + { + "epoch": 0.3457059679767103, + "grad_norm": 0.1678011417388916, + "learning_rate": 3.806089505247752e-05, + "loss": 0.1560648798942566, + "step": 1900 + }, + { + "epoch": 0.34661572052401746, + "grad_norm": 0.17944541573524475, + "learning_rate": 3.799807449828238e-05, + "loss": 0.16072254180908202, + "step": 1905 + }, + { + "epoch": 0.3475254730713246, + "grad_norm": 0.166817307472229, + "learning_rate": 3.793514126336691e-05, + "loss": 0.1542820692062378, + "step": 1910 + }, + { + "epoch": 0.3484352256186317, + "grad_norm": 0.16047626733779907, + "learning_rate": 3.787209589330134e-05, + "loss": 0.16092092990875245, + "step": 1915 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.16478900611400604, + "learning_rate": 3.7808938934627965e-05, + "loss": 0.16765867471694945, + "step": 1920 + }, + { + "epoch": 0.350254730713246, + "grad_norm": 0.15349514782428741, + "learning_rate": 3.774567093485648e-05, + "loss": 0.15890377759933472, + "step": 1925 + }, + { + "epoch": 0.3511644832605531, + "grad_norm": 0.1515921950340271, + "learning_rate": 3.768229244245917e-05, + "loss": 0.16668319702148438, + "step": 1930 + }, + { + "epoch": 0.35207423580786024, + "grad_norm": 0.16310466825962067, + "learning_rate": 3.7618804006866195e-05, + "loss": 0.15182652473449706, + "step": 1935 + }, + { + "epoch": 0.3529839883551674, + "grad_norm": 0.17294517159461975, + "learning_rate": 3.755520617846084e-05, + "loss": 0.16287628412246705, + "step": 1940 + }, + { + "epoch": 0.35389374090247455, + "grad_norm": 0.1482895463705063, + "learning_rate": 3.749149950857467e-05, + "loss": 0.15321952104568481, + "step": 1945 + }, + { + "epoch": 0.3548034934497817, + "grad_norm": 0.2236029952764511, + "learning_rate": 3.7427684549482847e-05, + "loss": 0.15403482913970948, + "step": 1950 + }, + { + "epoch": 0.3557132459970888, + "grad_norm": 0.20185327529907227, + "learning_rate": 3.736376185439927e-05, + "loss": 0.1633884072303772, + "step": 1955 + }, + { + "epoch": 0.35662299854439594, + "grad_norm": 0.13906247913837433, + "learning_rate": 3.7299731977471816e-05, + "loss": 0.15925350189208984, + "step": 1960 + }, + { + "epoch": 0.35753275109170307, + "grad_norm": 0.18665002286434174, + "learning_rate": 3.723559547377751e-05, + "loss": 0.1612026572227478, + "step": 1965 + }, + { + "epoch": 0.3584425036390102, + "grad_norm": 0.16913433372974396, + "learning_rate": 3.717135289931774e-05, + "loss": 0.15479494333267213, + "step": 1970 + }, + { + "epoch": 0.35935225618631733, + "grad_norm": 0.1620066910982132, + "learning_rate": 3.7107004811013434e-05, + "loss": 0.1604058027267456, + "step": 1975 + }, + { + "epoch": 0.36026200873362446, + "grad_norm": 0.16838301718235016, + "learning_rate": 3.704255176670021e-05, + "loss": 0.15335073471069335, + "step": 1980 + }, + { + "epoch": 0.3611717612809316, + "grad_norm": 0.3054695427417755, + "learning_rate": 3.6977994325123535e-05, + "loss": 0.16558053493499755, + "step": 1985 + }, + { + "epoch": 0.3620815138282387, + "grad_norm": 0.1526716649532318, + "learning_rate": 3.6913333045933934e-05, + "loss": 0.16148923635482787, + "step": 1990 + }, + { + "epoch": 0.36299126637554585, + "grad_norm": 0.15328513085842133, + "learning_rate": 3.684856848968209e-05, + "loss": 0.1553613781929016, + "step": 1995 + }, + { + "epoch": 0.363901018922853, + "grad_norm": 0.16129714250564575, + "learning_rate": 3.6783701217813995e-05, + "loss": 0.16724612712860107, + "step": 2000 + }, + { + "epoch": 0.3648107714701601, + "grad_norm": 0.15715539455413818, + "learning_rate": 3.6718731792666086e-05, + "loss": 0.15867922306060792, + "step": 2005 + }, + { + "epoch": 0.36572052401746724, + "grad_norm": 0.15569166839122772, + "learning_rate": 3.6653660777460366e-05, + "loss": 0.1552058696746826, + "step": 2010 + }, + { + "epoch": 0.36663027656477437, + "grad_norm": 0.16223010420799255, + "learning_rate": 3.6588488736299535e-05, + "loss": 0.1583200454711914, + "step": 2015 + }, + { + "epoch": 0.3675400291120815, + "grad_norm": 0.18441995978355408, + "learning_rate": 3.652321623416209e-05, + "loss": 0.15050662755966188, + "step": 2020 + }, + { + "epoch": 0.36844978165938863, + "grad_norm": 0.13792674243450165, + "learning_rate": 3.645784383689742e-05, + "loss": 0.15458759069442748, + "step": 2025 + }, + { + "epoch": 0.36935953420669576, + "grad_norm": 0.14993111789226532, + "learning_rate": 3.639237211122091e-05, + "loss": 0.15926222801208495, + "step": 2030 + }, + { + "epoch": 0.3702692867540029, + "grad_norm": 0.16815930604934692, + "learning_rate": 3.632680162470904e-05, + "loss": 0.15524441003799438, + "step": 2035 + }, + { + "epoch": 0.37117903930131, + "grad_norm": 0.13312821090221405, + "learning_rate": 3.626113294579441e-05, + "loss": 0.15883516073226928, + "step": 2040 + }, + { + "epoch": 0.37208879184861715, + "grad_norm": 0.16838273406028748, + "learning_rate": 3.619536664376091e-05, + "loss": 0.15829603672027587, + "step": 2045 + }, + { + "epoch": 0.37299854439592434, + "grad_norm": 0.14706873893737793, + "learning_rate": 3.612950328873869e-05, + "loss": 0.15644397735595703, + "step": 2050 + }, + { + "epoch": 0.37390829694323147, + "grad_norm": 0.1644199639558792, + "learning_rate": 3.606354345169926e-05, + "loss": 0.15858219861984252, + "step": 2055 + }, + { + "epoch": 0.3748180494905386, + "grad_norm": 0.18077051639556885, + "learning_rate": 3.599748770445055e-05, + "loss": 0.1641286849975586, + "step": 2060 + }, + { + "epoch": 0.3757278020378457, + "grad_norm": 0.16329127550125122, + "learning_rate": 3.5931336619631914e-05, + "loss": 0.15027186870574952, + "step": 2065 + }, + { + "epoch": 0.37663755458515286, + "grad_norm": 0.16346783936023712, + "learning_rate": 3.586509077070922e-05, + "loss": 0.1558641314506531, + "step": 2070 + }, + { + "epoch": 0.37754730713246, + "grad_norm": 0.1727602630853653, + "learning_rate": 3.5798750731969834e-05, + "loss": 0.15390506982803345, + "step": 2075 + }, + { + "epoch": 0.3784570596797671, + "grad_norm": 0.7598192691802979, + "learning_rate": 3.5732317078517654e-05, + "loss": 0.1533232808113098, + "step": 2080 + }, + { + "epoch": 0.37936681222707425, + "grad_norm": 0.1433355212211609, + "learning_rate": 3.5665790386268124e-05, + "loss": 0.15560413599014283, + "step": 2085 + }, + { + "epoch": 0.3802765647743814, + "grad_norm": 0.18439625203609467, + "learning_rate": 3.559917123194325e-05, + "loss": 0.16695556640625, + "step": 2090 + }, + { + "epoch": 0.3811863173216885, + "grad_norm": 0.1693502813577652, + "learning_rate": 3.55324601930666e-05, + "loss": 0.15957870483398437, + "step": 2095 + }, + { + "epoch": 0.38209606986899564, + "grad_norm": 0.17776088416576385, + "learning_rate": 3.54656578479583e-05, + "loss": 0.1527492880821228, + "step": 2100 + }, + { + "epoch": 0.38300582241630277, + "grad_norm": 0.15993724763393402, + "learning_rate": 3.539876477572998e-05, + "loss": 0.1567505717277527, + "step": 2105 + }, + { + "epoch": 0.3839155749636099, + "grad_norm": 0.17067375779151917, + "learning_rate": 3.533178155627981e-05, + "loss": 0.14660797119140626, + "step": 2110 + }, + { + "epoch": 0.384825327510917, + "grad_norm": 0.20239882171154022, + "learning_rate": 3.526470877028745e-05, + "loss": 0.1596767544746399, + "step": 2115 + }, + { + "epoch": 0.38573508005822416, + "grad_norm": 0.1863643079996109, + "learning_rate": 3.5197546999209005e-05, + "loss": 0.15738571882247926, + "step": 2120 + }, + { + "epoch": 0.3866448326055313, + "grad_norm": 0.16994133591651917, + "learning_rate": 3.5130296825272014e-05, + "loss": 0.16255316734313965, + "step": 2125 + }, + { + "epoch": 0.3875545851528384, + "grad_norm": 0.18703415989875793, + "learning_rate": 3.5062958831470355e-05, + "loss": 0.15206334590911866, + "step": 2130 + }, + { + "epoch": 0.38846433770014555, + "grad_norm": 0.15433982014656067, + "learning_rate": 3.4995533601559226e-05, + "loss": 0.1590178370475769, + "step": 2135 + }, + { + "epoch": 0.3893740902474527, + "grad_norm": 0.16498146951198578, + "learning_rate": 3.4928021720050104e-05, + "loss": 0.14759145975112914, + "step": 2140 + }, + { + "epoch": 0.3902838427947598, + "grad_norm": 0.17880478501319885, + "learning_rate": 3.486042377220562e-05, + "loss": 0.1642458915710449, + "step": 2145 + }, + { + "epoch": 0.39119359534206694, + "grad_norm": 0.14700061082839966, + "learning_rate": 3.479274034403455e-05, + "loss": 0.16105138063430785, + "step": 2150 + }, + { + "epoch": 0.39210334788937407, + "grad_norm": 0.1620762050151825, + "learning_rate": 3.472497202228664e-05, + "loss": 0.15104985237121582, + "step": 2155 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.1625058799982071, + "learning_rate": 3.4657119394447654e-05, + "loss": 0.16145485639572144, + "step": 2160 + }, + { + "epoch": 0.3939228529839884, + "grad_norm": 0.1631549596786499, + "learning_rate": 3.458918304873417e-05, + "loss": 0.16712255477905275, + "step": 2165 + }, + { + "epoch": 0.3948326055312955, + "grad_norm": 0.16041551530361176, + "learning_rate": 3.452116357408853e-05, + "loss": 0.15118330717086792, + "step": 2170 + }, + { + "epoch": 0.39574235807860264, + "grad_norm": 0.16692611575126648, + "learning_rate": 3.44530615601737e-05, + "loss": 0.16982550621032716, + "step": 2175 + }, + { + "epoch": 0.39665211062590977, + "grad_norm": 0.16082268953323364, + "learning_rate": 3.438487759736821e-05, + "loss": 0.1513260006904602, + "step": 2180 + }, + { + "epoch": 0.3975618631732169, + "grad_norm": 0.1474589854478836, + "learning_rate": 3.4316612276761004e-05, + "loss": 0.14968743324279785, + "step": 2185 + }, + { + "epoch": 0.39847161572052403, + "grad_norm": 0.14531342685222626, + "learning_rate": 3.42482661901463e-05, + "loss": 0.1563260555267334, + "step": 2190 + }, + { + "epoch": 0.39938136826783116, + "grad_norm": 0.16775506734848022, + "learning_rate": 3.41798399300185e-05, + "loss": 0.14861010313034057, + "step": 2195 + }, + { + "epoch": 0.4002911208151383, + "grad_norm": 0.15065217018127441, + "learning_rate": 3.411133408956703e-05, + "loss": 0.15559519529342652, + "step": 2200 + }, + { + "epoch": 0.4012008733624454, + "grad_norm": 0.16655296087265015, + "learning_rate": 3.4042749262671184e-05, + "loss": 0.16025567054748535, + "step": 2205 + }, + { + "epoch": 0.40211062590975255, + "grad_norm": 0.14773905277252197, + "learning_rate": 3.397408604389501e-05, + "loss": 0.15074082612991332, + "step": 2210 + }, + { + "epoch": 0.4030203784570597, + "grad_norm": 0.16233304142951965, + "learning_rate": 3.3905345028482125e-05, + "loss": 0.15490520000457764, + "step": 2215 + }, + { + "epoch": 0.4039301310043668, + "grad_norm": 0.17520153522491455, + "learning_rate": 3.383652681235058e-05, + "loss": 0.1517520785331726, + "step": 2220 + }, + { + "epoch": 0.40483988355167394, + "grad_norm": 0.14749875664710999, + "learning_rate": 3.376763199208766e-05, + "loss": 0.15410997867584228, + "step": 2225 + }, + { + "epoch": 0.40574963609898107, + "grad_norm": 0.16855919361114502, + "learning_rate": 3.369866116494477e-05, + "loss": 0.1510261058807373, + "step": 2230 + }, + { + "epoch": 0.4066593886462882, + "grad_norm": 0.1594122350215912, + "learning_rate": 3.362961492883218e-05, + "loss": 0.1493813395500183, + "step": 2235 + }, + { + "epoch": 0.40756914119359533, + "grad_norm": 0.13645926117897034, + "learning_rate": 3.3560493882313915e-05, + "loss": 0.14876762628555298, + "step": 2240 + }, + { + "epoch": 0.40847889374090246, + "grad_norm": 0.14304400980472565, + "learning_rate": 3.349129862460251e-05, + "loss": 0.15567013025283813, + "step": 2245 + }, + { + "epoch": 0.4093886462882096, + "grad_norm": 0.17040041089057922, + "learning_rate": 3.342202975555386e-05, + "loss": 0.1563249945640564, + "step": 2250 + }, + { + "epoch": 0.4102983988355167, + "grad_norm": 0.15594671666622162, + "learning_rate": 3.3352687875661984e-05, + "loss": 0.1546410083770752, + "step": 2255 + }, + { + "epoch": 0.41120815138282385, + "grad_norm": 0.1677195280790329, + "learning_rate": 3.328327358605384e-05, + "loss": 0.15710171461105346, + "step": 2260 + }, + { + "epoch": 0.412117903930131, + "grad_norm": 0.1731705516576767, + "learning_rate": 3.321378748848412e-05, + "loss": 0.16444036960601807, + "step": 2265 + }, + { + "epoch": 0.4130276564774381, + "grad_norm": 0.18779033422470093, + "learning_rate": 3.3144230185329984e-05, + "loss": 0.15659687519073487, + "step": 2270 + }, + { + "epoch": 0.4139374090247453, + "grad_norm": 0.1543768346309662, + "learning_rate": 3.3074602279585913e-05, + "loss": 0.15100739002227784, + "step": 2275 + }, + { + "epoch": 0.4148471615720524, + "grad_norm": 0.16672168672084808, + "learning_rate": 3.300490437485843e-05, + "loss": 0.15535364151000977, + "step": 2280 + }, + { + "epoch": 0.41575691411935956, + "grad_norm": 0.16741308569908142, + "learning_rate": 3.293513707536089e-05, + "loss": 0.15523911714553834, + "step": 2285 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.1488303542137146, + "learning_rate": 3.286530098590822e-05, + "loss": 0.1542000651359558, + "step": 2290 + }, + { + "epoch": 0.4175764192139738, + "grad_norm": 0.1637732982635498, + "learning_rate": 3.2795396711911694e-05, + "loss": 0.15354831218719484, + "step": 2295 + }, + { + "epoch": 0.41848617176128095, + "grad_norm": 0.1472022533416748, + "learning_rate": 3.272542485937369e-05, + "loss": 0.16235145330429077, + "step": 2300 + }, + { + "epoch": 0.4193959243085881, + "grad_norm": 0.15908290445804596, + "learning_rate": 3.265538603488241e-05, + "loss": 0.15642645359039306, + "step": 2305 + }, + { + "epoch": 0.4203056768558952, + "grad_norm": 0.1584865301847458, + "learning_rate": 3.2585280845606645e-05, + "loss": 0.15490249395370484, + "step": 2310 + }, + { + "epoch": 0.42121542940320233, + "grad_norm": 0.15893949568271637, + "learning_rate": 3.251510989929052e-05, + "loss": 0.1598116159439087, + "step": 2315 + }, + { + "epoch": 0.42212518195050946, + "grad_norm": 0.18930596113204956, + "learning_rate": 3.244487380424817e-05, + "loss": 0.1482008934020996, + "step": 2320 + }, + { + "epoch": 0.4230349344978166, + "grad_norm": 0.132876455783844, + "learning_rate": 3.237457316935856e-05, + "loss": 0.15304710865020751, + "step": 2325 + }, + { + "epoch": 0.4239446870451237, + "grad_norm": 0.16447032988071442, + "learning_rate": 3.2304208604060106e-05, + "loss": 0.15298750400543212, + "step": 2330 + }, + { + "epoch": 0.42485443959243085, + "grad_norm": 0.17748120427131653, + "learning_rate": 3.223378071834546e-05, + "loss": 0.1556084156036377, + "step": 2335 + }, + { + "epoch": 0.425764192139738, + "grad_norm": 0.16366586089134216, + "learning_rate": 3.2163290122756206e-05, + "loss": 0.14387927055358887, + "step": 2340 + }, + { + "epoch": 0.4266739446870451, + "grad_norm": 0.15398970246315002, + "learning_rate": 3.209273742837755e-05, + "loss": 0.16091293096542358, + "step": 2345 + }, + { + "epoch": 0.42758369723435224, + "grad_norm": 0.164212167263031, + "learning_rate": 3.202212324683305e-05, + "loss": 0.15523531436920165, + "step": 2350 + }, + { + "epoch": 0.4284934497816594, + "grad_norm": 0.16749800741672516, + "learning_rate": 3.1951448190279255e-05, + "loss": 0.15354975461959838, + "step": 2355 + }, + { + "epoch": 0.4294032023289665, + "grad_norm": 0.14137034118175507, + "learning_rate": 3.18807128714005e-05, + "loss": 0.14981694221496583, + "step": 2360 + }, + { + "epoch": 0.43031295487627363, + "grad_norm": 0.14848439395427704, + "learning_rate": 3.1809917903403507e-05, + "loss": 0.15448769330978393, + "step": 2365 + }, + { + "epoch": 0.43122270742358076, + "grad_norm": 0.1747605800628662, + "learning_rate": 3.1739063900012095e-05, + "loss": 0.15882387161254882, + "step": 2370 + }, + { + "epoch": 0.4321324599708879, + "grad_norm": 0.16054467856884003, + "learning_rate": 3.166815147546186e-05, + "loss": 0.15170297622680665, + "step": 2375 + }, + { + "epoch": 0.433042212518195, + "grad_norm": 0.15428027510643005, + "learning_rate": 3.1597181244494886e-05, + "loss": 0.16202548742294312, + "step": 2380 + }, + { + "epoch": 0.4339519650655022, + "grad_norm": 0.16747219860553741, + "learning_rate": 3.1526153822354325e-05, + "loss": 0.15461477041244506, + "step": 2385 + }, + { + "epoch": 0.43486171761280934, + "grad_norm": 0.17415772378444672, + "learning_rate": 3.145506982477918e-05, + "loss": 0.16173542737960817, + "step": 2390 + }, + { + "epoch": 0.43577147016011647, + "grad_norm": 0.1293518990278244, + "learning_rate": 3.1383929867998865e-05, + "loss": 0.15572521686553956, + "step": 2395 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.16909323632717133, + "learning_rate": 3.1312734568727935e-05, + "loss": 0.15898628234863282, + "step": 2400 + }, + { + "epoch": 0.43759097525473073, + "grad_norm": 0.16770294308662415, + "learning_rate": 3.124148454416069e-05, + "loss": 0.1536281704902649, + "step": 2405 + }, + { + "epoch": 0.43850072780203786, + "grad_norm": 0.14078612625598907, + "learning_rate": 3.117018041196585e-05, + "loss": 0.15274266004562378, + "step": 2410 + }, + { + "epoch": 0.439410480349345, + "grad_norm": 0.15457536280155182, + "learning_rate": 3.1098822790281226e-05, + "loss": 0.15391263961791993, + "step": 2415 + }, + { + "epoch": 0.4403202328966521, + "grad_norm": 0.1640717089176178, + "learning_rate": 3.102741229770827e-05, + "loss": 0.15515168905258178, + "step": 2420 + }, + { + "epoch": 0.44122998544395925, + "grad_norm": 0.2601533830165863, + "learning_rate": 3.095594955330683e-05, + "loss": 0.1587247371673584, + "step": 2425 + }, + { + "epoch": 0.4421397379912664, + "grad_norm": 0.1352529525756836, + "learning_rate": 3.08844351765897e-05, + "loss": 0.1483217477798462, + "step": 2430 + }, + { + "epoch": 0.4430494905385735, + "grad_norm": 0.18479721248149872, + "learning_rate": 3.081286978751728e-05, + "loss": 0.15121787786483765, + "step": 2435 + }, + { + "epoch": 0.44395924308588064, + "grad_norm": 0.16954511404037476, + "learning_rate": 3.074125400649221e-05, + "loss": 0.16073100566864013, + "step": 2440 + }, + { + "epoch": 0.44486899563318777, + "grad_norm": 0.15154729783535004, + "learning_rate": 3.0669588454353944e-05, + "loss": 0.15738017559051515, + "step": 2445 + }, + { + "epoch": 0.4457787481804949, + "grad_norm": 0.1540488302707672, + "learning_rate": 3.059787375237344e-05, + "loss": 0.1515384554862976, + "step": 2450 + }, + { + "epoch": 0.44668850072780203, + "grad_norm": 0.1814432442188263, + "learning_rate": 3.052611052224774e-05, + "loss": 0.15731438398361205, + "step": 2455 + }, + { + "epoch": 0.44759825327510916, + "grad_norm": 0.16657036542892456, + "learning_rate": 3.0454299386094542e-05, + "loss": 0.15741543769836425, + "step": 2460 + }, + { + "epoch": 0.4485080058224163, + "grad_norm": 0.2177237570285797, + "learning_rate": 3.0382440966446875e-05, + "loss": 0.14972515106201173, + "step": 2465 + }, + { + "epoch": 0.4494177583697234, + "grad_norm": 0.1669909954071045, + "learning_rate": 3.031053588624766e-05, + "loss": 0.1506432294845581, + "step": 2470 + }, + { + "epoch": 0.45032751091703055, + "grad_norm": 0.1752234250307083, + "learning_rate": 3.0238584768844313e-05, + "loss": 0.14969609975814818, + "step": 2475 + }, + { + "epoch": 0.4512372634643377, + "grad_norm": 0.18267901241779327, + "learning_rate": 3.0166588237983363e-05, + "loss": 0.15112748146057128, + "step": 2480 + }, + { + "epoch": 0.4521470160116448, + "grad_norm": 0.16250105202198029, + "learning_rate": 3.0094546917805007e-05, + "loss": 0.15864100456237792, + "step": 2485 + }, + { + "epoch": 0.45305676855895194, + "grad_norm": 0.14825721085071564, + "learning_rate": 3.0022461432837752e-05, + "loss": 0.1513954520225525, + "step": 2490 + }, + { + "epoch": 0.4539665211062591, + "grad_norm": 0.1626640111207962, + "learning_rate": 2.9950332407992943e-05, + "loss": 0.1505578875541687, + "step": 2495 + }, + { + "epoch": 0.45487627365356625, + "grad_norm": 0.1535351574420929, + "learning_rate": 2.987816046855939e-05, + "loss": 0.15255829095840454, + "step": 2500 + }, + { + "epoch": 0.4557860262008734, + "grad_norm": 0.17552775144577026, + "learning_rate": 2.9805946240197928e-05, + "loss": 0.1516443133354187, + "step": 2505 + }, + { + "epoch": 0.4566957787481805, + "grad_norm": 0.16020981967449188, + "learning_rate": 2.9733690348935994e-05, + "loss": 0.14519743919372557, + "step": 2510 + }, + { + "epoch": 0.45760553129548764, + "grad_norm": 0.17800211906433105, + "learning_rate": 2.9661393421162204e-05, + "loss": 0.15679080486297609, + "step": 2515 + }, + { + "epoch": 0.4585152838427948, + "grad_norm": 0.16016991436481476, + "learning_rate": 2.9589056083620902e-05, + "loss": 0.14768127202987671, + "step": 2520 + }, + { + "epoch": 0.4594250363901019, + "grad_norm": 0.16272081434726715, + "learning_rate": 2.951667896340679e-05, + "loss": 0.1513301968574524, + "step": 2525 + }, + { + "epoch": 0.46033478893740903, + "grad_norm": 0.1726413071155548, + "learning_rate": 2.9444262687959402e-05, + "loss": 0.14819332361221313, + "step": 2530 + }, + { + "epoch": 0.46124454148471616, + "grad_norm": 0.1670403778553009, + "learning_rate": 2.9371807885057735e-05, + "loss": 0.15245940685272216, + "step": 2535 + }, + { + "epoch": 0.4621542940320233, + "grad_norm": 0.1650049239397049, + "learning_rate": 2.9299315182814772e-05, + "loss": 0.15187418460845947, + "step": 2540 + }, + { + "epoch": 0.4630640465793304, + "grad_norm": 0.16327734291553497, + "learning_rate": 2.9226785209672047e-05, + "loss": 0.15579828023910522, + "step": 2545 + }, + { + "epoch": 0.46397379912663755, + "grad_norm": 0.3367880582809448, + "learning_rate": 2.91542185943942e-05, + "loss": 0.15617697238922118, + "step": 2550 + }, + { + "epoch": 0.4648835516739447, + "grad_norm": 0.1731594055891037, + "learning_rate": 2.908161596606353e-05, + "loss": 0.1559603691101074, + "step": 2555 + }, + { + "epoch": 0.4657933042212518, + "grad_norm": 0.1477293074131012, + "learning_rate": 2.9008977954074517e-05, + "loss": 0.15567959547042848, + "step": 2560 + }, + { + "epoch": 0.46670305676855894, + "grad_norm": 0.16227173805236816, + "learning_rate": 2.8936305188128392e-05, + "loss": 0.1522113561630249, + "step": 2565 + }, + { + "epoch": 0.4676128093158661, + "grad_norm": 0.2031075656414032, + "learning_rate": 2.8863598298227674e-05, + "loss": 0.15054640769958497, + "step": 2570 + }, + { + "epoch": 0.4685225618631732, + "grad_norm": 0.18351472914218903, + "learning_rate": 2.8790857914670698e-05, + "loss": 0.15837019681930542, + "step": 2575 + }, + { + "epoch": 0.46943231441048033, + "grad_norm": 0.15914765000343323, + "learning_rate": 2.871808466804616e-05, + "loss": 0.1550259470939636, + "step": 2580 + }, + { + "epoch": 0.47034206695778746, + "grad_norm": 0.17366717755794525, + "learning_rate": 2.8645279189227636e-05, + "loss": 0.15702390670776367, + "step": 2585 + }, + { + "epoch": 0.4712518195050946, + "grad_norm": 0.13677838444709778, + "learning_rate": 2.8572442109368134e-05, + "loss": 0.15485031604766847, + "step": 2590 + }, + { + "epoch": 0.4721615720524017, + "grad_norm": 0.1477748304605484, + "learning_rate": 2.8499574059894617e-05, + "loss": 0.14577245712280273, + "step": 2595 + }, + { + "epoch": 0.47307132459970885, + "grad_norm": 0.1582217663526535, + "learning_rate": 2.842667567250252e-05, + "loss": 0.15586793422698975, + "step": 2600 + }, + { + "epoch": 0.47398107714701604, + "grad_norm": 0.19658738374710083, + "learning_rate": 2.8353747579150268e-05, + "loss": 0.15060495138168334, + "step": 2605 + }, + { + "epoch": 0.47489082969432317, + "grad_norm": 0.176767036318779, + "learning_rate": 2.828079041205382e-05, + "loss": 0.15116705894470214, + "step": 2610 + }, + { + "epoch": 0.4758005822416303, + "grad_norm": 0.16972507536411285, + "learning_rate": 2.820780480368117e-05, + "loss": 0.1541937470436096, + "step": 2615 + }, + { + "epoch": 0.47671033478893743, + "grad_norm": 0.1548585742712021, + "learning_rate": 2.8134791386746884e-05, + "loss": 0.14334756135940552, + "step": 2620 + }, + { + "epoch": 0.47762008733624456, + "grad_norm": 0.15411986410617828, + "learning_rate": 2.806175079420658e-05, + "loss": 0.14642289876937867, + "step": 2625 + }, + { + "epoch": 0.4785298398835517, + "grad_norm": 0.16609491407871246, + "learning_rate": 2.7988683659251474e-05, + "loss": 0.15083469152450563, + "step": 2630 + }, + { + "epoch": 0.4794395924308588, + "grad_norm": 0.16592684388160706, + "learning_rate": 2.791559061530289e-05, + "loss": 0.14218480587005616, + "step": 2635 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.1764935404062271, + "learning_rate": 2.7842472296006722e-05, + "loss": 0.15004343986511232, + "step": 2640 + }, + { + "epoch": 0.4812590975254731, + "grad_norm": 0.20094354450702667, + "learning_rate": 2.7769329335228022e-05, + "loss": 0.14975016117095946, + "step": 2645 + }, + { + "epoch": 0.4821688500727802, + "grad_norm": 0.1869269460439682, + "learning_rate": 2.769616236704542e-05, + "loss": 0.155981707572937, + "step": 2650 + }, + { + "epoch": 0.48307860262008734, + "grad_norm": 0.16671574115753174, + "learning_rate": 2.762297202574571e-05, + "loss": 0.14633859395980836, + "step": 2655 + }, + { + "epoch": 0.48398835516739447, + "grad_norm": 0.14999663829803467, + "learning_rate": 2.754975894581826e-05, + "loss": 0.15692603588104248, + "step": 2660 + }, + { + "epoch": 0.4848981077147016, + "grad_norm": 0.16893649101257324, + "learning_rate": 2.7476523761949592e-05, + "loss": 0.14530394077301026, + "step": 2665 + }, + { + "epoch": 0.48580786026200873, + "grad_norm": 0.16039884090423584, + "learning_rate": 2.740326710901784e-05, + "loss": 0.15013915300369263, + "step": 2670 + }, + { + "epoch": 0.48671761280931586, + "grad_norm": 0.16672006249427795, + "learning_rate": 2.732998962208725e-05, + "loss": 0.15667349100112915, + "step": 2675 + }, + { + "epoch": 0.487627365356623, + "grad_norm": 0.2160867303609848, + "learning_rate": 2.7256691936402684e-05, + "loss": 0.14335414171218872, + "step": 2680 + }, + { + "epoch": 0.4885371179039301, + "grad_norm": 0.349030077457428, + "learning_rate": 2.71833746873841e-05, + "loss": 0.1437530279159546, + "step": 2685 + }, + { + "epoch": 0.48944687045123725, + "grad_norm": 0.18380966782569885, + "learning_rate": 2.7110038510621073e-05, + "loss": 0.1476014256477356, + "step": 2690 + }, + { + "epoch": 0.4903566229985444, + "grad_norm": 0.1523742377758026, + "learning_rate": 2.703668404186722e-05, + "loss": 0.14578526020050048, + "step": 2695 + }, + { + "epoch": 0.4912663755458515, + "grad_norm": 0.16092729568481445, + "learning_rate": 2.696331191703479e-05, + "loss": 0.15335593223571778, + "step": 2700 + }, + { + "epoch": 0.49217612809315864, + "grad_norm": 0.17185333371162415, + "learning_rate": 2.688992277218904e-05, + "loss": 0.1540898084640503, + "step": 2705 + }, + { + "epoch": 0.49308588064046577, + "grad_norm": 0.1521969735622406, + "learning_rate": 2.6816517243542792e-05, + "loss": 0.15171396732330322, + "step": 2710 + }, + { + "epoch": 0.49399563318777295, + "grad_norm": 0.16064171493053436, + "learning_rate": 2.674309596745092e-05, + "loss": 0.1505839228630066, + "step": 2715 + }, + { + "epoch": 0.4949053857350801, + "grad_norm": 0.16430898010730743, + "learning_rate": 2.6669659580404795e-05, + "loss": 0.1551363468170166, + "step": 2720 + }, + { + "epoch": 0.4958151382823872, + "grad_norm": 0.16125477850437164, + "learning_rate": 2.659620871902677e-05, + "loss": 0.15069286823272704, + "step": 2725 + }, + { + "epoch": 0.49672489082969434, + "grad_norm": 0.1428450047969818, + "learning_rate": 2.652274402006471e-05, + "loss": 0.15511081218719483, + "step": 2730 + }, + { + "epoch": 0.4976346433770015, + "grad_norm": 0.15452754497528076, + "learning_rate": 2.6449266120386406e-05, + "loss": 0.14941939115524291, + "step": 2735 + }, + { + "epoch": 0.4985443959243086, + "grad_norm": 0.17243537306785583, + "learning_rate": 2.6375775656974123e-05, + "loss": 0.151741623878479, + "step": 2740 + }, + { + "epoch": 0.49945414847161573, + "grad_norm": 0.13736453652381897, + "learning_rate": 2.6302273266919008e-05, + "loss": 0.147042977809906, + "step": 2745 + }, + { + "epoch": 0.5003639010189228, + "grad_norm": 0.16241495311260223, + "learning_rate": 2.6228759587415614e-05, + "loss": 0.14664684534072875, + "step": 2750 + }, + { + "epoch": 0.50127365356623, + "grad_norm": 0.193496435880661, + "learning_rate": 2.6155235255756356e-05, + "loss": 0.15486966371536254, + "step": 2755 + }, + { + "epoch": 0.5021834061135371, + "grad_norm": 0.1542847901582718, + "learning_rate": 2.6081700909326e-05, + "loss": 0.15148009061813356, + "step": 2760 + }, + { + "epoch": 0.5030931586608443, + "grad_norm": 0.1696511209011078, + "learning_rate": 2.6008157185596142e-05, + "loss": 0.14190055131912233, + "step": 2765 + }, + { + "epoch": 0.5040029112081513, + "grad_norm": 0.14690077304840088, + "learning_rate": 2.5934604722119655e-05, + "loss": 0.1570739269256592, + "step": 2770 + }, + { + "epoch": 0.5049126637554585, + "grad_norm": 0.17149671912193298, + "learning_rate": 2.5861044156525162e-05, + "loss": 0.14940304756164552, + "step": 2775 + }, + { + "epoch": 0.5058224163027657, + "grad_norm": 0.16639231145381927, + "learning_rate": 2.578747612651155e-05, + "loss": 0.15691237449645995, + "step": 2780 + }, + { + "epoch": 0.5067321688500728, + "grad_norm": 0.2062763124704361, + "learning_rate": 2.5713901269842404e-05, + "loss": 0.1564734935760498, + "step": 2785 + }, + { + "epoch": 0.50764192139738, + "grad_norm": 0.12636308372020721, + "learning_rate": 2.5640320224340502e-05, + "loss": 0.14539417028427123, + "step": 2790 + }, + { + "epoch": 0.508551673944687, + "grad_norm": 0.16893689334392548, + "learning_rate": 2.556673362788225e-05, + "loss": 0.15440930128097535, + "step": 2795 + }, + { + "epoch": 0.5094614264919942, + "grad_norm": 0.16250015795230865, + "learning_rate": 2.54931421183922e-05, + "loss": 0.14485647678375244, + "step": 2800 + }, + { + "epoch": 0.5103711790393013, + "grad_norm": 0.1700994372367859, + "learning_rate": 2.5419546333837462e-05, + "loss": 0.15411126613616943, + "step": 2805 + }, + { + "epoch": 0.5112809315866085, + "grad_norm": 0.1547706127166748, + "learning_rate": 2.5345946912222256e-05, + "loss": 0.15516072511672974, + "step": 2810 + }, + { + "epoch": 0.5121906841339156, + "grad_norm": 0.17955681681632996, + "learning_rate": 2.527234449158228e-05, + "loss": 0.15546923875808716, + "step": 2815 + }, + { + "epoch": 0.5131004366812227, + "grad_norm": 0.163709819316864, + "learning_rate": 2.519873970997927e-05, + "loss": 0.15665037631988527, + "step": 2820 + }, + { + "epoch": 0.5140101892285298, + "grad_norm": 0.17859576642513275, + "learning_rate": 2.5125133205495405e-05, + "loss": 0.1539722204208374, + "step": 2825 + }, + { + "epoch": 0.514919941775837, + "grad_norm": 0.17443150281906128, + "learning_rate": 2.5051525616227806e-05, + "loss": 0.148411762714386, + "step": 2830 + }, + { + "epoch": 0.5158296943231441, + "grad_norm": 0.17397581040859222, + "learning_rate": 2.4977917580283007e-05, + "loss": 0.14880497455596925, + "step": 2835 + }, + { + "epoch": 0.5167394468704513, + "grad_norm": 0.14565663039684296, + "learning_rate": 2.4904309735771405e-05, + "loss": 0.14934680461883545, + "step": 2840 + }, + { + "epoch": 0.5176491994177583, + "grad_norm": 0.17895659804344177, + "learning_rate": 2.4830702720801746e-05, + "loss": 0.15287939310073853, + "step": 2845 + }, + { + "epoch": 0.5185589519650655, + "grad_norm": 0.15812788903713226, + "learning_rate": 2.4757097173475572e-05, + "loss": 0.14576947689056396, + "step": 2850 + }, + { + "epoch": 0.5194687045123726, + "grad_norm": 0.17123781144618988, + "learning_rate": 2.46834937318817e-05, + "loss": 0.15224847793579102, + "step": 2855 + }, + { + "epoch": 0.5203784570596798, + "grad_norm": 0.14845474064350128, + "learning_rate": 2.460989303409072e-05, + "loss": 0.14901585578918458, + "step": 2860 + }, + { + "epoch": 0.5212882096069869, + "grad_norm": 0.23493704199790955, + "learning_rate": 2.4536295718149407e-05, + "loss": 0.1517487049102783, + "step": 2865 + }, + { + "epoch": 0.522197962154294, + "grad_norm": 0.16209843754768372, + "learning_rate": 2.4462702422075217e-05, + "loss": 0.14327445030212402, + "step": 2870 + }, + { + "epoch": 0.5231077147016011, + "grad_norm": 0.17249803245067596, + "learning_rate": 2.4389113783850793e-05, + "loss": 0.1517549753189087, + "step": 2875 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14561402797698975, + "learning_rate": 2.431553044141836e-05, + "loss": 0.14764087200164794, + "step": 2880 + }, + { + "epoch": 0.5249272197962155, + "grad_norm": 0.17033302783966064, + "learning_rate": 2.4241953032674256e-05, + "loss": 0.15181604623794556, + "step": 2885 + }, + { + "epoch": 0.5258369723435226, + "grad_norm": 0.1184430941939354, + "learning_rate": 2.4168382195463367e-05, + "loss": 0.14264242649078368, + "step": 2890 + }, + { + "epoch": 0.5267467248908297, + "grad_norm": 0.17521196603775024, + "learning_rate": 2.4094818567573618e-05, + "loss": 0.1509538173675537, + "step": 2895 + }, + { + "epoch": 0.5276564774381368, + "grad_norm": 0.1681576371192932, + "learning_rate": 2.4021262786730428e-05, + "loss": 0.15344605445861817, + "step": 2900 + }, + { + "epoch": 0.528566229985444, + "grad_norm": 0.17134182155132294, + "learning_rate": 2.3947715490591206e-05, + "loss": 0.15161689519882202, + "step": 2905 + }, + { + "epoch": 0.5294759825327511, + "grad_norm": 0.1796472817659378, + "learning_rate": 2.3874177316739778e-05, + "loss": 0.15086464881896972, + "step": 2910 + }, + { + "epoch": 0.5303857350800583, + "grad_norm": 0.23268625140190125, + "learning_rate": 2.380064890268093e-05, + "loss": 0.15354180335998535, + "step": 2915 + }, + { + "epoch": 0.5312954876273653, + "grad_norm": 0.16318941116333008, + "learning_rate": 2.372713088583481e-05, + "loss": 0.15131797790527343, + "step": 2920 + }, + { + "epoch": 0.5322052401746725, + "grad_norm": 0.18171803653240204, + "learning_rate": 2.365362390353143e-05, + "loss": 0.15784090757369995, + "step": 2925 + }, + { + "epoch": 0.5331149927219796, + "grad_norm": 0.17672640085220337, + "learning_rate": 2.3580128593005156e-05, + "loss": 0.15509436130523682, + "step": 2930 + }, + { + "epoch": 0.5340247452692868, + "grad_norm": 0.15985223650932312, + "learning_rate": 2.3506645591389174e-05, + "loss": 0.14851027727127075, + "step": 2935 + }, + { + "epoch": 0.5349344978165939, + "grad_norm": 0.16597607731819153, + "learning_rate": 2.343317553570995e-05, + "loss": 0.1504931092262268, + "step": 2940 + }, + { + "epoch": 0.535844250363901, + "grad_norm": 0.20180748403072357, + "learning_rate": 2.3359719062881725e-05, + "loss": 0.15023820400238036, + "step": 2945 + }, + { + "epoch": 0.5367540029112081, + "grad_norm": 0.1735963076353073, + "learning_rate": 2.3286276809701e-05, + "loss": 0.15374408960342406, + "step": 2950 + }, + { + "epoch": 0.5376637554585153, + "grad_norm": 0.17629501223564148, + "learning_rate": 2.3212849412840995e-05, + "loss": 0.15007833242416382, + "step": 2955 + }, + { + "epoch": 0.5385735080058224, + "grad_norm": 0.1493796557188034, + "learning_rate": 2.3139437508846155e-05, + "loss": 0.15206656455993653, + "step": 2960 + }, + { + "epoch": 0.5394832605531296, + "grad_norm": 0.17426837980747223, + "learning_rate": 2.306604173412659e-05, + "loss": 0.1441131591796875, + "step": 2965 + }, + { + "epoch": 0.5403930131004366, + "grad_norm": 0.16984431445598602, + "learning_rate": 2.2992662724952613e-05, + "loss": 0.14438753128051757, + "step": 2970 + }, + { + "epoch": 0.5413027656477438, + "grad_norm": 0.1814386397600174, + "learning_rate": 2.2919301117449167e-05, + "loss": 0.14887022972106934, + "step": 2975 + }, + { + "epoch": 0.5422125181950509, + "grad_norm": 0.158392995595932, + "learning_rate": 2.2845957547590368e-05, + "loss": 0.14404361248016356, + "step": 2980 + }, + { + "epoch": 0.5431222707423581, + "grad_norm": 0.17496263980865479, + "learning_rate": 2.2772632651193953e-05, + "loss": 0.1454906702041626, + "step": 2985 + }, + { + "epoch": 0.5440320232896652, + "grad_norm": 0.157533198595047, + "learning_rate": 2.2699327063915766e-05, + "loss": 0.1458217740058899, + "step": 2990 + }, + { + "epoch": 0.5449417758369723, + "grad_norm": 0.1767890453338623, + "learning_rate": 2.262604142124427e-05, + "loss": 0.14384825229644777, + "step": 2995 + }, + { + "epoch": 0.5458515283842795, + "grad_norm": 0.1851050704717636, + "learning_rate": 2.2552776358495033e-05, + "loss": 0.14832457304000854, + "step": 3000 + }, + { + "epoch": 0.5467612809315866, + "grad_norm": 0.164175882935524, + "learning_rate": 2.247953251080521e-05, + "loss": 0.14999878406524658, + "step": 3005 + }, + { + "epoch": 0.5476710334788938, + "grad_norm": 0.3403675854206085, + "learning_rate": 2.240631051312804e-05, + "loss": 0.1443937063217163, + "step": 3010 + }, + { + "epoch": 0.5485807860262009, + "grad_norm": 0.16751109063625336, + "learning_rate": 2.2333111000227342e-05, + "loss": 0.1462402105331421, + "step": 3015 + }, + { + "epoch": 0.549490538573508, + "grad_norm": 0.14741151034832, + "learning_rate": 2.225993460667201e-05, + "loss": 0.149855899810791, + "step": 3020 + }, + { + "epoch": 0.5504002911208151, + "grad_norm": 0.20605266094207764, + "learning_rate": 2.218678196683054e-05, + "loss": 0.15413178205490113, + "step": 3025 + }, + { + "epoch": 0.5513100436681223, + "grad_norm": 0.14884796738624573, + "learning_rate": 2.2113653714865473e-05, + "loss": 0.14592334032058715, + "step": 3030 + }, + { + "epoch": 0.5522197962154294, + "grad_norm": 0.17114350199699402, + "learning_rate": 2.2040550484727943e-05, + "loss": 0.1498338460922241, + "step": 3035 + }, + { + "epoch": 0.5531295487627366, + "grad_norm": 0.16496853530406952, + "learning_rate": 2.196747291015219e-05, + "loss": 0.14650315046310425, + "step": 3040 + }, + { + "epoch": 0.5540393013100436, + "grad_norm": 0.15172401070594788, + "learning_rate": 2.189442162465001e-05, + "loss": 0.14984124898910522, + "step": 3045 + }, + { + "epoch": 0.5549490538573508, + "grad_norm": 0.19258467853069305, + "learning_rate": 2.182139726150532e-05, + "loss": 0.1486764669418335, + "step": 3050 + }, + { + "epoch": 0.5558588064046579, + "grad_norm": 0.1749001443386078, + "learning_rate": 2.1748400453768652e-05, + "loss": 0.14983701705932617, + "step": 3055 + }, + { + "epoch": 0.5567685589519651, + "grad_norm": 0.37510567903518677, + "learning_rate": 2.1675431834251637e-05, + "loss": 0.14483561515808105, + "step": 3060 + }, + { + "epoch": 0.5576783114992722, + "grad_norm": 0.16932405531406403, + "learning_rate": 2.1602492035521553e-05, + "loss": 0.14487643241882325, + "step": 3065 + }, + { + "epoch": 0.5585880640465793, + "grad_norm": 0.174176424741745, + "learning_rate": 2.152958168989584e-05, + "loss": 0.14737497568130492, + "step": 3070 + }, + { + "epoch": 0.5594978165938864, + "grad_norm": 0.1601252257823944, + "learning_rate": 2.1456701429436577e-05, + "loss": 0.15183379650115966, + "step": 3075 + }, + { + "epoch": 0.5604075691411936, + "grad_norm": 0.14960910379886627, + "learning_rate": 2.1383851885945085e-05, + "loss": 0.143074893951416, + "step": 3080 + }, + { + "epoch": 0.5613173216885007, + "grad_norm": 0.1678633838891983, + "learning_rate": 2.1311033690956346e-05, + "loss": 0.14961432218551635, + "step": 3085 + }, + { + "epoch": 0.5622270742358079, + "grad_norm": 0.15814319252967834, + "learning_rate": 2.1238247475733613e-05, + "loss": 0.14308581352233887, + "step": 3090 + }, + { + "epoch": 0.5631368267831149, + "grad_norm": 0.21240772306919098, + "learning_rate": 2.1165493871262887e-05, + "loss": 0.14737485647201537, + "step": 3095 + }, + { + "epoch": 0.5640465793304221, + "grad_norm": 0.15161271393299103, + "learning_rate": 2.109277350824749e-05, + "loss": 0.14534420967102052, + "step": 3100 + }, + { + "epoch": 0.5649563318777293, + "grad_norm": 0.16572362184524536, + "learning_rate": 2.1020087017102537e-05, + "loss": 0.14299670457839966, + "step": 3105 + }, + { + "epoch": 0.5658660844250364, + "grad_norm": 0.1548164039850235, + "learning_rate": 2.094743502794954e-05, + "loss": 0.14371142387390137, + "step": 3110 + }, + { + "epoch": 0.5667758369723436, + "grad_norm": 0.2574169933795929, + "learning_rate": 2.0874818170610885e-05, + "loss": 0.14350423812866211, + "step": 3115 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.16359548270702362, + "learning_rate": 2.080223707460443e-05, + "loss": 0.1520243763923645, + "step": 3120 + }, + { + "epoch": 0.5685953420669578, + "grad_norm": 0.1798320859670639, + "learning_rate": 2.072969236913799e-05, + "loss": 0.14832595586776734, + "step": 3125 + }, + { + "epoch": 0.5695050946142649, + "grad_norm": 0.17045916616916656, + "learning_rate": 2.0657184683103926e-05, + "loss": 0.15308042764663696, + "step": 3130 + }, + { + "epoch": 0.5704148471615721, + "grad_norm": 0.16345897316932678, + "learning_rate": 2.058471464507366e-05, + "loss": 0.14564799070358275, + "step": 3135 + }, + { + "epoch": 0.5713245997088792, + "grad_norm": 0.15170110762119293, + "learning_rate": 2.0512282883292257e-05, + "loss": 0.14161767959594726, + "step": 3140 + }, + { + "epoch": 0.5722343522561864, + "grad_norm": 0.8107472658157349, + "learning_rate": 2.0439890025672955e-05, + "loss": 0.14481087923049926, + "step": 3145 + }, + { + "epoch": 0.5731441048034934, + "grad_norm": 0.15346679091453552, + "learning_rate": 2.036753669979174e-05, + "loss": 0.14860262870788574, + "step": 3150 + }, + { + "epoch": 0.5740538573508006, + "grad_norm": 0.1632593423128128, + "learning_rate": 2.0295223532881886e-05, + "loss": 0.1481687307357788, + "step": 3155 + }, + { + "epoch": 0.5749636098981077, + "grad_norm": 0.23399172723293304, + "learning_rate": 2.022295115182852e-05, + "loss": 0.149153733253479, + "step": 3160 + }, + { + "epoch": 0.5758733624454149, + "grad_norm": 0.14977394044399261, + "learning_rate": 2.015072018316323e-05, + "loss": 0.14921388626098633, + "step": 3165 + }, + { + "epoch": 0.576783114992722, + "grad_norm": 0.1550658792257309, + "learning_rate": 2.007853125305856e-05, + "loss": 0.1482759475708008, + "step": 3170 + }, + { + "epoch": 0.5776928675400291, + "grad_norm": 0.16661737859249115, + "learning_rate": 2.0006384987322645e-05, + "loss": 0.14903552532196046, + "step": 3175 + }, + { + "epoch": 0.5786026200873362, + "grad_norm": 0.1746823936700821, + "learning_rate": 1.9934282011393753e-05, + "loss": 0.1412947654724121, + "step": 3180 + }, + { + "epoch": 0.5795123726346434, + "grad_norm": 0.17025792598724365, + "learning_rate": 1.9862222950334857e-05, + "loss": 0.15289769172668458, + "step": 3185 + }, + { + "epoch": 0.5804221251819505, + "grad_norm": 0.16857658326625824, + "learning_rate": 1.9790208428828252e-05, + "loss": 0.14419941902160643, + "step": 3190 + }, + { + "epoch": 0.5813318777292577, + "grad_norm": 0.16099876165390015, + "learning_rate": 1.9718239071170118e-05, + "loss": 0.14476487636566163, + "step": 3195 + }, + { + "epoch": 0.5822416302765647, + "grad_norm": 0.16140873730182648, + "learning_rate": 1.964631550126508e-05, + "loss": 0.14588416814804078, + "step": 3200 + }, + { + "epoch": 0.5831513828238719, + "grad_norm": 0.15719448029994965, + "learning_rate": 1.957443834262087e-05, + "loss": 0.15144693851470947, + "step": 3205 + }, + { + "epoch": 0.584061135371179, + "grad_norm": 0.16512645781040192, + "learning_rate": 1.950260821834285e-05, + "loss": 0.14787566661834717, + "step": 3210 + }, + { + "epoch": 0.5849708879184862, + "grad_norm": 0.18584516644477844, + "learning_rate": 1.9430825751128643e-05, + "loss": 0.14514710903167724, + "step": 3215 + }, + { + "epoch": 0.5858806404657934, + "grad_norm": 0.17640981078147888, + "learning_rate": 1.9359091563262742e-05, + "loss": 0.1511004686355591, + "step": 3220 + }, + { + "epoch": 0.5867903930131004, + "grad_norm": 0.1697624921798706, + "learning_rate": 1.9287406276611095e-05, + "loss": 0.15392563343048096, + "step": 3225 + }, + { + "epoch": 0.5877001455604076, + "grad_norm": 0.1677260845899582, + "learning_rate": 1.9215770512615725e-05, + "loss": 0.15311745405197144, + "step": 3230 + }, + { + "epoch": 0.5886098981077147, + "grad_norm": 0.15357480943202972, + "learning_rate": 1.9144184892289337e-05, + "loss": 0.14370160102844237, + "step": 3235 + }, + { + "epoch": 0.5895196506550219, + "grad_norm": 0.18601207435131073, + "learning_rate": 1.9072650036209955e-05, + "loss": 0.14095077514648438, + "step": 3240 + }, + { + "epoch": 0.590429403202329, + "grad_norm": 0.17313526570796967, + "learning_rate": 1.9001166564515513e-05, + "loss": 0.148259174823761, + "step": 3245 + }, + { + "epoch": 0.5913391557496361, + "grad_norm": 0.1634378433227539, + "learning_rate": 1.8929735096898504e-05, + "loss": 0.15082294940948487, + "step": 3250 + }, + { + "epoch": 0.5922489082969432, + "grad_norm": 0.18542174994945526, + "learning_rate": 1.885835625260058e-05, + "loss": 0.14461435079574586, + "step": 3255 + }, + { + "epoch": 0.5931586608442504, + "grad_norm": 0.1740756630897522, + "learning_rate": 1.87870306504072e-05, + "loss": 0.14083608388900756, + "step": 3260 + }, + { + "epoch": 0.5940684133915575, + "grad_norm": 0.25606217980384827, + "learning_rate": 1.8715758908642288e-05, + "loss": 0.15125386714935302, + "step": 3265 + }, + { + "epoch": 0.5949781659388647, + "grad_norm": 0.20194627344608307, + "learning_rate": 1.8644541645162834e-05, + "loss": 0.14433003664016725, + "step": 3270 + }, + { + "epoch": 0.5958879184861717, + "grad_norm": 0.1902168095111847, + "learning_rate": 1.8573379477353542e-05, + "loss": 0.14718132019042968, + "step": 3275 + }, + { + "epoch": 0.5967976710334789, + "grad_norm": 0.15122972428798676, + "learning_rate": 1.850227302212151e-05, + "loss": 0.153376567363739, + "step": 3280 + }, + { + "epoch": 0.597707423580786, + "grad_norm": 0.14331959187984467, + "learning_rate": 1.843122289589085e-05, + "loss": 0.146630597114563, + "step": 3285 + }, + { + "epoch": 0.5986171761280932, + "grad_norm": 0.15083099901676178, + "learning_rate": 1.836022971459737e-05, + "loss": 0.1445971965789795, + "step": 3290 + }, + { + "epoch": 0.5995269286754003, + "grad_norm": 0.16585418581962585, + "learning_rate": 1.828929409368321e-05, + "loss": 0.15120241641998292, + "step": 3295 + }, + { + "epoch": 0.6004366812227074, + "grad_norm": 0.1653224229812622, + "learning_rate": 1.8218416648091524e-05, + "loss": 0.14349838495254516, + "step": 3300 + }, + { + "epoch": 0.6013464337700145, + "grad_norm": 0.1891375184059143, + "learning_rate": 1.8147597992261124e-05, + "loss": 0.15171384811401367, + "step": 3305 + }, + { + "epoch": 0.6022561863173217, + "grad_norm": 0.13392704725265503, + "learning_rate": 1.8076838740121187e-05, + "loss": 0.14607118368148803, + "step": 3310 + }, + { + "epoch": 0.6031659388646288, + "grad_norm": 0.15421944856643677, + "learning_rate": 1.8006139505085926e-05, + "loss": 0.1380957007408142, + "step": 3315 + }, + { + "epoch": 0.604075691411936, + "grad_norm": 0.16637761890888214, + "learning_rate": 1.7935500900049246e-05, + "loss": 0.14604611396789552, + "step": 3320 + }, + { + "epoch": 0.6049854439592431, + "grad_norm": 0.16638441383838654, + "learning_rate": 1.7864923537379445e-05, + "loss": 0.1513611912727356, + "step": 3325 + }, + { + "epoch": 0.6058951965065502, + "grad_norm": 0.1745707094669342, + "learning_rate": 1.779440802891394e-05, + "loss": 0.15391240119934083, + "step": 3330 + }, + { + "epoch": 0.6068049490538574, + "grad_norm": 0.1620505005121231, + "learning_rate": 1.77239549859539e-05, + "loss": 0.14986472129821776, + "step": 3335 + }, + { + "epoch": 0.6077147016011645, + "grad_norm": 0.1579132080078125, + "learning_rate": 1.7653565019259e-05, + "loss": 0.1466603994369507, + "step": 3340 + }, + { + "epoch": 0.6086244541484717, + "grad_norm": 0.19154994189739227, + "learning_rate": 1.7583238739042086e-05, + "loss": 0.15228934288024903, + "step": 3345 + }, + { + "epoch": 0.6095342066957787, + "grad_norm": 0.15771779417991638, + "learning_rate": 1.7512976754963913e-05, + "loss": 0.14965078830718995, + "step": 3350 + }, + { + "epoch": 0.6104439592430859, + "grad_norm": 0.18406136333942413, + "learning_rate": 1.744277967612785e-05, + "loss": 0.1473196864128113, + "step": 3355 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.17603816092014313, + "learning_rate": 1.7372648111074607e-05, + "loss": 0.1430676221847534, + "step": 3360 + }, + { + "epoch": 0.6122634643377002, + "grad_norm": 0.156408429145813, + "learning_rate": 1.7302582667776933e-05, + "loss": 0.14018454551696777, + "step": 3365 + }, + { + "epoch": 0.6131732168850073, + "grad_norm": 0.14504843950271606, + "learning_rate": 1.7232583953634407e-05, + "loss": 0.14505640268325806, + "step": 3370 + }, + { + "epoch": 0.6140829694323144, + "grad_norm": 0.1864968240261078, + "learning_rate": 1.716265257546808e-05, + "loss": 0.14810394048690795, + "step": 3375 + }, + { + "epoch": 0.6149927219796215, + "grad_norm": 0.1621711403131485, + "learning_rate": 1.7092789139515295e-05, + "loss": 0.14203091859817504, + "step": 3380 + }, + { + "epoch": 0.6159024745269287, + "grad_norm": 0.17994914948940277, + "learning_rate": 1.70229942514244e-05, + "loss": 0.14565644264221192, + "step": 3385 + }, + { + "epoch": 0.6168122270742358, + "grad_norm": 0.1707388162612915, + "learning_rate": 1.6953268516249486e-05, + "loss": 0.14449434280395507, + "step": 3390 + }, + { + "epoch": 0.617721979621543, + "grad_norm": 0.16425329446792603, + "learning_rate": 1.6883612538445175e-05, + "loss": 0.15185940265655518, + "step": 3395 + }, + { + "epoch": 0.61863173216885, + "grad_norm": 0.15987788140773773, + "learning_rate": 1.6814026921861335e-05, + "loss": 0.14994431734085084, + "step": 3400 + }, + { + "epoch": 0.6195414847161572, + "grad_norm": 0.2987690269947052, + "learning_rate": 1.6744512269737894e-05, + "loss": 0.14652738571166993, + "step": 3405 + }, + { + "epoch": 0.6204512372634643, + "grad_norm": 0.1681315004825592, + "learning_rate": 1.6675069184699574e-05, + "loss": 0.14566165208816528, + "step": 3410 + }, + { + "epoch": 0.6213609898107715, + "grad_norm": 0.15847846865653992, + "learning_rate": 1.660569826875069e-05, + "loss": 0.1374401330947876, + "step": 3415 + }, + { + "epoch": 0.6222707423580786, + "grad_norm": 0.16370312869548798, + "learning_rate": 1.6536400123269907e-05, + "loss": 0.14905524253845215, + "step": 3420 + }, + { + "epoch": 0.6231804949053857, + "grad_norm": 0.16054444015026093, + "learning_rate": 1.6467175349005054e-05, + "loss": 0.1496324896812439, + "step": 3425 + }, + { + "epoch": 0.6240902474526928, + "grad_norm": 0.1663951277732849, + "learning_rate": 1.639802454606788e-05, + "loss": 0.1504170298576355, + "step": 3430 + }, + { + "epoch": 0.625, + "grad_norm": 0.1591310054063797, + "learning_rate": 1.6328948313928906e-05, + "loss": 0.1410186171531677, + "step": 3435 + }, + { + "epoch": 0.6259097525473072, + "grad_norm": 0.1637524962425232, + "learning_rate": 1.6259947251412178e-05, + "loss": 0.13963305950164795, + "step": 3440 + }, + { + "epoch": 0.6268195050946143, + "grad_norm": 0.1688017100095749, + "learning_rate": 1.6191021956690096e-05, + "loss": 0.14727941751480103, + "step": 3445 + }, + { + "epoch": 0.6277292576419214, + "grad_norm": 0.1691795438528061, + "learning_rate": 1.612217302727821e-05, + "loss": 0.14856183528900146, + "step": 3450 + }, + { + "epoch": 0.6286390101892285, + "grad_norm": 0.18501746654510498, + "learning_rate": 1.60534010600301e-05, + "loss": 0.1481746554374695, + "step": 3455 + }, + { + "epoch": 0.6295487627365357, + "grad_norm": 0.16234716773033142, + "learning_rate": 1.5984706651132125e-05, + "loss": 0.1427530527114868, + "step": 3460 + }, + { + "epoch": 0.6304585152838428, + "grad_norm": 0.16013780236244202, + "learning_rate": 1.5916090396098293e-05, + "loss": 0.14264426231384278, + "step": 3465 + }, + { + "epoch": 0.63136826783115, + "grad_norm": 0.17116396129131317, + "learning_rate": 1.5847552889765095e-05, + "loss": 0.14109257459640503, + "step": 3470 + }, + { + "epoch": 0.632278020378457, + "grad_norm": 0.16949769854545593, + "learning_rate": 1.5779094726286344e-05, + "loss": 0.1387040376663208, + "step": 3475 + }, + { + "epoch": 0.6331877729257642, + "grad_norm": 0.14983431994915009, + "learning_rate": 1.5710716499128044e-05, + "loss": 0.13645120859146118, + "step": 3480 + }, + { + "epoch": 0.6340975254730713, + "grad_norm": 0.1632554531097412, + "learning_rate": 1.564241880106321e-05, + "loss": 0.14883992671966553, + "step": 3485 + }, + { + "epoch": 0.6350072780203785, + "grad_norm": 0.15686506032943726, + "learning_rate": 1.5574202224166744e-05, + "loss": 0.14244272708892822, + "step": 3490 + }, + { + "epoch": 0.6359170305676856, + "grad_norm": 0.18843458592891693, + "learning_rate": 1.5506067359810333e-05, + "loss": 0.15149861574172974, + "step": 3495 + }, + { + "epoch": 0.6368267831149927, + "grad_norm": 0.15874551236629486, + "learning_rate": 1.5438014798657275e-05, + "loss": 0.15188233852386473, + "step": 3500 + }, + { + "epoch": 0.6377365356622998, + "grad_norm": 0.17014239728450775, + "learning_rate": 1.5370045130657366e-05, + "loss": 0.14694437980651856, + "step": 3505 + }, + { + "epoch": 0.638646288209607, + "grad_norm": 0.14744038879871368, + "learning_rate": 1.5302158945041838e-05, + "loss": 0.14434736967086792, + "step": 3510 + }, + { + "epoch": 0.6395560407569141, + "grad_norm": 0.2069770246744156, + "learning_rate": 1.523435683031818e-05, + "loss": 0.13982917070388795, + "step": 3515 + }, + { + "epoch": 0.6404657933042213, + "grad_norm": 0.17811502516269684, + "learning_rate": 1.5166639374265063e-05, + "loss": 0.1408839702606201, + "step": 3520 + }, + { + "epoch": 0.6413755458515283, + "grad_norm": 0.165786474943161, + "learning_rate": 1.509900716392728e-05, + "loss": 0.15312877893447877, + "step": 3525 + }, + { + "epoch": 0.6422852983988355, + "grad_norm": 0.1633884161710739, + "learning_rate": 1.5031460785610596e-05, + "loss": 0.1488795518875122, + "step": 3530 + }, + { + "epoch": 0.6431950509461426, + "grad_norm": 0.16498984396457672, + "learning_rate": 1.4964000824876723e-05, + "loss": 0.15031465291976928, + "step": 3535 + }, + { + "epoch": 0.6441048034934498, + "grad_norm": 0.18043678998947144, + "learning_rate": 1.4896627866538191e-05, + "loss": 0.147829806804657, + "step": 3540 + }, + { + "epoch": 0.6450145560407569, + "grad_norm": 0.16813597083091736, + "learning_rate": 1.4829342494653315e-05, + "loss": 0.1418998956680298, + "step": 3545 + }, + { + "epoch": 0.645924308588064, + "grad_norm": 0.1817242056131363, + "learning_rate": 1.4762145292521118e-05, + "loss": 0.14508869647979736, + "step": 3550 + }, + { + "epoch": 0.6468340611353712, + "grad_norm": 0.14666494727134705, + "learning_rate": 1.469503684267628e-05, + "loss": 0.14159854650497436, + "step": 3555 + }, + { + "epoch": 0.6477438136826783, + "grad_norm": 0.16485381126403809, + "learning_rate": 1.4628017726884086e-05, + "loss": 0.14419105052947997, + "step": 3560 + }, + { + "epoch": 0.6486535662299855, + "grad_norm": 0.16100342571735382, + "learning_rate": 1.4561088526135375e-05, + "loss": 0.14501721858978273, + "step": 3565 + }, + { + "epoch": 0.6495633187772926, + "grad_norm": 0.16996590793132782, + "learning_rate": 1.4494249820641493e-05, + "loss": 0.1377166509628296, + "step": 3570 + }, + { + "epoch": 0.6504730713245997, + "grad_norm": 0.16168837249279022, + "learning_rate": 1.4427502189829339e-05, + "loss": 0.1414325475692749, + "step": 3575 + }, + { + "epoch": 0.6513828238719068, + "grad_norm": 0.16318906843662262, + "learning_rate": 1.436084621233621e-05, + "loss": 0.14685193300247193, + "step": 3580 + }, + { + "epoch": 0.652292576419214, + "grad_norm": 0.1636219322681427, + "learning_rate": 1.4294282466004899e-05, + "loss": 0.1405899167060852, + "step": 3585 + }, + { + "epoch": 0.6532023289665211, + "grad_norm": 0.1838461309671402, + "learning_rate": 1.422781152787865e-05, + "loss": 0.14386332035064697, + "step": 3590 + }, + { + "epoch": 0.6541120815138283, + "grad_norm": 0.1796344667673111, + "learning_rate": 1.4161433974196115e-05, + "loss": 0.1513024687767029, + "step": 3595 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.16424529254436493, + "learning_rate": 1.4095150380386427e-05, + "loss": 0.14238927364349366, + "step": 3600 + }, + { + "epoch": 0.6559315866084425, + "grad_norm": 0.19264160096645355, + "learning_rate": 1.402896132106415e-05, + "loss": 0.14297477006912232, + "step": 3605 + }, + { + "epoch": 0.6568413391557496, + "grad_norm": 0.18319948017597198, + "learning_rate": 1.3962867370024347e-05, + "loss": 0.1448880434036255, + "step": 3610 + }, + { + "epoch": 0.6577510917030568, + "grad_norm": 0.16507290303707123, + "learning_rate": 1.389686910023758e-05, + "loss": 0.14724698066711425, + "step": 3615 + }, + { + "epoch": 0.6586608442503639, + "grad_norm": 0.17871244251728058, + "learning_rate": 1.3830967083844942e-05, + "loss": 0.14479386806488037, + "step": 3620 + }, + { + "epoch": 0.659570596797671, + "grad_norm": 0.1846228390932083, + "learning_rate": 1.3765161892153112e-05, + "loss": 0.1453616738319397, + "step": 3625 + }, + { + "epoch": 0.6604803493449781, + "grad_norm": 0.17185978591442108, + "learning_rate": 1.3699454095629372e-05, + "loss": 0.14906206130981445, + "step": 3630 + }, + { + "epoch": 0.6613901018922853, + "grad_norm": 0.14751191437244415, + "learning_rate": 1.3633844263896698e-05, + "loss": 0.13991892337799072, + "step": 3635 + }, + { + "epoch": 0.6622998544395924, + "grad_norm": 0.22059763967990875, + "learning_rate": 1.3568332965728817e-05, + "loss": 0.14680869579315187, + "step": 3640 + }, + { + "epoch": 0.6632096069868996, + "grad_norm": 0.15295909345149994, + "learning_rate": 1.3502920769045232e-05, + "loss": 0.1404443383216858, + "step": 3645 + }, + { + "epoch": 0.6641193595342066, + "grad_norm": 0.14600558578968048, + "learning_rate": 1.3437608240906364e-05, + "loss": 0.14663270711898804, + "step": 3650 + }, + { + "epoch": 0.6650291120815138, + "grad_norm": 0.15548352897167206, + "learning_rate": 1.3372395947508587e-05, + "loss": 0.1431443452835083, + "step": 3655 + }, + { + "epoch": 0.665938864628821, + "grad_norm": 0.1813388466835022, + "learning_rate": 1.3307284454179342e-05, + "loss": 0.1458706736564636, + "step": 3660 + }, + { + "epoch": 0.6668486171761281, + "grad_norm": 0.16326870024204254, + "learning_rate": 1.3242274325372247e-05, + "loss": 0.14700595140457154, + "step": 3665 + }, + { + "epoch": 0.6677583697234353, + "grad_norm": 0.18779197335243225, + "learning_rate": 1.3177366124662149e-05, + "loss": 0.1497237801551819, + "step": 3670 + }, + { + "epoch": 0.6686681222707423, + "grad_norm": 0.16291002929210663, + "learning_rate": 1.3112560414740315e-05, + "loss": 0.1387086868286133, + "step": 3675 + }, + { + "epoch": 0.6695778748180495, + "grad_norm": 0.1532297134399414, + "learning_rate": 1.3047857757409487e-05, + "loss": 0.14497545957565308, + "step": 3680 + }, + { + "epoch": 0.6704876273653566, + "grad_norm": 0.14697515964508057, + "learning_rate": 1.2983258713579066e-05, + "loss": 0.1494283437728882, + "step": 3685 + }, + { + "epoch": 0.6713973799126638, + "grad_norm": 0.15213452279567719, + "learning_rate": 1.2918763843260218e-05, + "loss": 0.1468907594680786, + "step": 3690 + }, + { + "epoch": 0.6723071324599709, + "grad_norm": 0.1745215803384781, + "learning_rate": 1.285437370556099e-05, + "loss": 0.14997754096984864, + "step": 3695 + }, + { + "epoch": 0.673216885007278, + "grad_norm": 0.19207637012004852, + "learning_rate": 1.2790088858681577e-05, + "loss": 0.14202862977981567, + "step": 3700 + }, + { + "epoch": 0.6741266375545851, + "grad_norm": 0.1521359086036682, + "learning_rate": 1.2725909859909313e-05, + "loss": 0.14547673463821412, + "step": 3705 + }, + { + "epoch": 0.6750363901018923, + "grad_norm": 0.16975535452365875, + "learning_rate": 1.2661837265613999e-05, + "loss": 0.14006874561309815, + "step": 3710 + }, + { + "epoch": 0.6759461426491994, + "grad_norm": 0.22234582901000977, + "learning_rate": 1.2597871631242992e-05, + "loss": 0.13691173791885375, + "step": 3715 + }, + { + "epoch": 0.6768558951965066, + "grad_norm": 0.16082969307899475, + "learning_rate": 1.2534013511316383e-05, + "loss": 0.14932308197021485, + "step": 3720 + }, + { + "epoch": 0.6777656477438136, + "grad_norm": 0.1751091182231903, + "learning_rate": 1.247026345942226e-05, + "loss": 0.14531974792480468, + "step": 3725 + }, + { + "epoch": 0.6786754002911208, + "grad_norm": 0.15838147699832916, + "learning_rate": 1.2406622028211844e-05, + "loss": 0.14759832620620728, + "step": 3730 + }, + { + "epoch": 0.6795851528384279, + "grad_norm": 0.1771744042634964, + "learning_rate": 1.2343089769394714e-05, + "loss": 0.1382831573486328, + "step": 3735 + }, + { + "epoch": 0.6804949053857351, + "grad_norm": 0.16301538050174713, + "learning_rate": 1.2279667233734037e-05, + "loss": 0.14444775581359864, + "step": 3740 + }, + { + "epoch": 0.6814046579330422, + "grad_norm": 0.1584121286869049, + "learning_rate": 1.2216354971041796e-05, + "loss": 0.14200170040130616, + "step": 3745 + }, + { + "epoch": 0.6823144104803494, + "grad_norm": 0.139187291264534, + "learning_rate": 1.2153153530174007e-05, + "loss": 0.14318310022354125, + "step": 3750 + }, + { + "epoch": 0.6832241630276564, + "grad_norm": 0.13665248453617096, + "learning_rate": 1.2090063459025955e-05, + "loss": 0.1411946654319763, + "step": 3755 + }, + { + "epoch": 0.6841339155749636, + "grad_norm": 0.16273781657218933, + "learning_rate": 1.2027085304527475e-05, + "loss": 0.14873508214950562, + "step": 3760 + }, + { + "epoch": 0.6850436681222707, + "grad_norm": 0.16317526996135712, + "learning_rate": 1.1964219612638194e-05, + "loss": 0.14644203186035157, + "step": 3765 + }, + { + "epoch": 0.6859534206695779, + "grad_norm": 0.17253617942333221, + "learning_rate": 1.1901466928342777e-05, + "loss": 0.14027841091156007, + "step": 3770 + }, + { + "epoch": 0.6868631732168851, + "grad_norm": 0.19692830741405487, + "learning_rate": 1.183882779564624e-05, + "loss": 0.14411110877990724, + "step": 3775 + }, + { + "epoch": 0.6877729257641921, + "grad_norm": 0.15444578230381012, + "learning_rate": 1.1776302757569214e-05, + "loss": 0.14355008602142333, + "step": 3780 + }, + { + "epoch": 0.6886826783114993, + "grad_norm": 0.1622200757265091, + "learning_rate": 1.1713892356143239e-05, + "loss": 0.14794334173202514, + "step": 3785 + }, + { + "epoch": 0.6895924308588064, + "grad_norm": 0.1898501068353653, + "learning_rate": 1.1651597132406073e-05, + "loss": 0.1418622612953186, + "step": 3790 + }, + { + "epoch": 0.6905021834061136, + "grad_norm": 0.17803208529949188, + "learning_rate": 1.1589417626396973e-05, + "loss": 0.14576040506362914, + "step": 3795 + }, + { + "epoch": 0.6914119359534207, + "grad_norm": 0.17138013243675232, + "learning_rate": 1.1527354377152053e-05, + "loss": 0.14494270086288452, + "step": 3800 + }, + { + "epoch": 0.6923216885007278, + "grad_norm": 0.15170913934707642, + "learning_rate": 1.1465407922699603e-05, + "loss": 0.144084370136261, + "step": 3805 + }, + { + "epoch": 0.6932314410480349, + "grad_norm": 0.158562570810318, + "learning_rate": 1.1403578800055387e-05, + "loss": 0.13636608123779298, + "step": 3810 + }, + { + "epoch": 0.6941411935953421, + "grad_norm": 0.17687302827835083, + "learning_rate": 1.1341867545218044e-05, + "loss": 0.14214688539505005, + "step": 3815 + }, + { + "epoch": 0.6950509461426492, + "grad_norm": 0.15394899249076843, + "learning_rate": 1.1280274693164378e-05, + "loss": 0.14914129972457885, + "step": 3820 + }, + { + "epoch": 0.6959606986899564, + "grad_norm": 0.15709355473518372, + "learning_rate": 1.12188007778448e-05, + "loss": 0.14798580408096312, + "step": 3825 + }, + { + "epoch": 0.6968704512372634, + "grad_norm": 0.16631539165973663, + "learning_rate": 1.115744633217864e-05, + "loss": 0.14756966829299928, + "step": 3830 + }, + { + "epoch": 0.6977802037845706, + "grad_norm": 0.15893076360225677, + "learning_rate": 1.109621188804951e-05, + "loss": 0.14061959981918334, + "step": 3835 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.183414489030838, + "learning_rate": 1.103509797630077e-05, + "loss": 0.1448473334312439, + "step": 3840 + }, + { + "epoch": 0.6995997088791849, + "grad_norm": 0.14087305963039398, + "learning_rate": 1.0974105126730841e-05, + "loss": 0.14369285106658936, + "step": 3845 + }, + { + "epoch": 0.700509461426492, + "grad_norm": 0.16919967532157898, + "learning_rate": 1.0913233868088685e-05, + "loss": 0.1478085398674011, + "step": 3850 + }, + { + "epoch": 0.7014192139737991, + "grad_norm": 0.1439533829689026, + "learning_rate": 1.0852484728069178e-05, + "loss": 0.14376721382141114, + "step": 3855 + }, + { + "epoch": 0.7023289665211062, + "grad_norm": 0.17719274759292603, + "learning_rate": 1.0791858233308521e-05, + "loss": 0.14089040756225585, + "step": 3860 + }, + { + "epoch": 0.7032387190684134, + "grad_norm": 0.19753769040107727, + "learning_rate": 1.0731354909379754e-05, + "loss": 0.15021742582321168, + "step": 3865 + }, + { + "epoch": 0.7041484716157205, + "grad_norm": 0.19186992943286896, + "learning_rate": 1.0670975280788086e-05, + "loss": 0.14113202095031738, + "step": 3870 + }, + { + "epoch": 0.7050582241630277, + "grad_norm": 0.1709229201078415, + "learning_rate": 1.0610719870966443e-05, + "loss": 0.1500566840171814, + "step": 3875 + }, + { + "epoch": 0.7059679767103348, + "grad_norm": 0.17846204340457916, + "learning_rate": 1.0550589202270892e-05, + "loss": 0.15014195442199707, + "step": 3880 + }, + { + "epoch": 0.7068777292576419, + "grad_norm": 0.1827082335948944, + "learning_rate": 1.0490583795976091e-05, + "loss": 0.1423472762107849, + "step": 3885 + }, + { + "epoch": 0.7077874818049491, + "grad_norm": 0.17418377101421356, + "learning_rate": 1.043070417227083e-05, + "loss": 0.14668900966644288, + "step": 3890 + }, + { + "epoch": 0.7086972343522562, + "grad_norm": 0.17385616898536682, + "learning_rate": 1.0370950850253449e-05, + "loss": 0.14627279043197633, + "step": 3895 + }, + { + "epoch": 0.7096069868995634, + "grad_norm": 0.16486723721027374, + "learning_rate": 1.0311324347927404e-05, + "loss": 0.14603652954101562, + "step": 3900 + }, + { + "epoch": 0.7105167394468704, + "grad_norm": 0.21806862950325012, + "learning_rate": 1.0251825182196732e-05, + "loss": 0.1488169550895691, + "step": 3905 + }, + { + "epoch": 0.7114264919941776, + "grad_norm": 0.19884569942951202, + "learning_rate": 1.019245386886159e-05, + "loss": 0.14387656450271608, + "step": 3910 + }, + { + "epoch": 0.7123362445414847, + "grad_norm": 0.16139011085033417, + "learning_rate": 1.0133210922613789e-05, + "loss": 0.1483074426651001, + "step": 3915 + }, + { + "epoch": 0.7132459970887919, + "grad_norm": 0.17000740766525269, + "learning_rate": 1.007409685703229e-05, + "loss": 0.14050065279006957, + "step": 3920 + }, + { + "epoch": 0.714155749636099, + "grad_norm": 0.17235304415225983, + "learning_rate": 1.0015112184578813e-05, + "loss": 0.1440442681312561, + "step": 3925 + }, + { + "epoch": 0.7150655021834061, + "grad_norm": 0.15737567842006683, + "learning_rate": 9.956257416593362e-06, + "loss": 0.14960765838623047, + "step": 3930 + }, + { + "epoch": 0.7159752547307132, + "grad_norm": 0.15499180555343628, + "learning_rate": 9.897533063289773e-06, + "loss": 0.14488829374313356, + "step": 3935 + }, + { + "epoch": 0.7168850072780204, + "grad_norm": 0.17744216322898865, + "learning_rate": 9.838939633751337e-06, + "loss": 0.1416949987411499, + "step": 3940 + }, + { + "epoch": 0.7177947598253275, + "grad_norm": 0.1597192883491516, + "learning_rate": 9.780477635926358e-06, + "loss": 0.14275280237197877, + "step": 3945 + }, + { + "epoch": 0.7187045123726347, + "grad_norm": 0.17800374329090118, + "learning_rate": 9.722147576623743e-06, + "loss": 0.14532098770141602, + "step": 3950 + }, + { + "epoch": 0.7196142649199417, + "grad_norm": 0.1828162521123886, + "learning_rate": 9.66394996150864e-06, + "loss": 0.14525585174560546, + "step": 3955 + }, + { + "epoch": 0.7205240174672489, + "grad_norm": 0.1800539344549179, + "learning_rate": 9.605885295098005e-06, + "loss": 0.14235819578170777, + "step": 3960 + }, + { + "epoch": 0.721433770014556, + "grad_norm": 0.16556483507156372, + "learning_rate": 9.54795408075628e-06, + "loss": 0.13965482711791993, + "step": 3965 + }, + { + "epoch": 0.7223435225618632, + "grad_norm": 0.1592024862766266, + "learning_rate": 9.49015682069101e-06, + "loss": 0.14051042795181273, + "step": 3970 + }, + { + "epoch": 0.7232532751091703, + "grad_norm": 0.18988847732543945, + "learning_rate": 9.43249401594846e-06, + "loss": 0.1436900496482849, + "step": 3975 + }, + { + "epoch": 0.7241630276564774, + "grad_norm": 0.24433808028697968, + "learning_rate": 9.374966166409329e-06, + "loss": 0.14883997440338134, + "step": 3980 + }, + { + "epoch": 0.7250727802037845, + "grad_norm": 0.15091639757156372, + "learning_rate": 9.317573770784352e-06, + "loss": 0.14726560115814208, + "step": 3985 + }, + { + "epoch": 0.7259825327510917, + "grad_norm": 0.17045573890209198, + "learning_rate": 9.260317326610051e-06, + "loss": 0.14120506048202514, + "step": 3990 + }, + { + "epoch": 0.7268922852983989, + "grad_norm": 0.18847957253456116, + "learning_rate": 9.203197330244343e-06, + "loss": 0.1377041220664978, + "step": 3995 + }, + { + "epoch": 0.727802037845706, + "grad_norm": 0.1516445279121399, + "learning_rate": 9.14621427686229e-06, + "loss": 0.14043946266174318, + "step": 4000 + }, + { + "epoch": 0.7287117903930131, + "grad_norm": 0.18264050781726837, + "learning_rate": 9.0893686604518e-06, + "loss": 0.14080368280410765, + "step": 4005 + }, + { + "epoch": 0.7296215429403202, + "grad_norm": 0.19129371643066406, + "learning_rate": 9.032660973809312e-06, + "loss": 0.1402561902999878, + "step": 4010 + }, + { + "epoch": 0.7305312954876274, + "grad_norm": 0.15762710571289062, + "learning_rate": 8.976091708535567e-06, + "loss": 0.14421157836914061, + "step": 4015 + }, + { + "epoch": 0.7314410480349345, + "grad_norm": 0.17785198986530304, + "learning_rate": 8.919661355031331e-06, + "loss": 0.14999009370803834, + "step": 4020 + }, + { + "epoch": 0.7323508005822417, + "grad_norm": 0.15306031703948975, + "learning_rate": 8.8633704024931e-06, + "loss": 0.14101698398590087, + "step": 4025 + }, + { + "epoch": 0.7332605531295487, + "grad_norm": 0.16481758654117584, + "learning_rate": 8.807219338908968e-06, + "loss": 0.14170764684677123, + "step": 4030 + }, + { + "epoch": 0.7341703056768559, + "grad_norm": 0.14892235398292542, + "learning_rate": 8.751208651054257e-06, + "loss": 0.15317896604537964, + "step": 4035 + }, + { + "epoch": 0.735080058224163, + "grad_norm": 0.1775592565536499, + "learning_rate": 8.695338824487409e-06, + "loss": 0.1520617723464966, + "step": 4040 + }, + { + "epoch": 0.7359898107714702, + "grad_norm": 0.1614258885383606, + "learning_rate": 8.639610343545728e-06, + "loss": 0.13747400045394897, + "step": 4045 + }, + { + "epoch": 0.7368995633187773, + "grad_norm": 0.21415506303310394, + "learning_rate": 8.58402369134117e-06, + "loss": 0.1432439088821411, + "step": 4050 + }, + { + "epoch": 0.7378093158660844, + "grad_norm": 0.1759418249130249, + "learning_rate": 8.528579349756205e-06, + "loss": 0.141641104221344, + "step": 4055 + }, + { + "epoch": 0.7387190684133915, + "grad_norm": 0.16738329827785492, + "learning_rate": 8.47327779943957e-06, + "loss": 0.14294810295104982, + "step": 4060 + }, + { + "epoch": 0.7396288209606987, + "grad_norm": 0.13916844129562378, + "learning_rate": 8.41811951980217e-06, + "loss": 0.13876968622207642, + "step": 4065 + }, + { + "epoch": 0.7405385735080058, + "grad_norm": 0.1828441321849823, + "learning_rate": 8.36310498901288e-06, + "loss": 0.148428475856781, + "step": 4070 + }, + { + "epoch": 0.741448326055313, + "grad_norm": 0.16534076631069183, + "learning_rate": 8.308234683994415e-06, + "loss": 0.14222711324691772, + "step": 4075 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.17922644317150116, + "learning_rate": 8.253509080419198e-06, + "loss": 0.14365782737731933, + "step": 4080 + }, + { + "epoch": 0.7432678311499272, + "grad_norm": 0.15061035752296448, + "learning_rate": 8.198928652705204e-06, + "loss": 0.13571925163269044, + "step": 4085 + }, + { + "epoch": 0.7441775836972343, + "grad_norm": 0.18075402081012726, + "learning_rate": 8.144493874011908e-06, + "loss": 0.14385528564453126, + "step": 4090 + }, + { + "epoch": 0.7450873362445415, + "grad_norm": 0.16514739394187927, + "learning_rate": 8.090205216236135e-06, + "loss": 0.14920626878738402, + "step": 4095 + }, + { + "epoch": 0.7459970887918487, + "grad_norm": 0.16453702747821808, + "learning_rate": 8.03606315000797e-06, + "loss": 0.14704222679138185, + "step": 4100 + }, + { + "epoch": 0.7469068413391557, + "grad_norm": 0.16719917953014374, + "learning_rate": 7.982068144686707e-06, + "loss": 0.14722511768341065, + "step": 4105 + }, + { + "epoch": 0.7478165938864629, + "grad_norm": 0.18499110639095306, + "learning_rate": 7.92822066835677e-06, + "loss": 0.1401848554611206, + "step": 4110 + }, + { + "epoch": 0.74872634643377, + "grad_norm": 0.17249563336372375, + "learning_rate": 7.87452118782363e-06, + "loss": 0.15132423639297485, + "step": 4115 + }, + { + "epoch": 0.7496360989810772, + "grad_norm": 0.15049682557582855, + "learning_rate": 7.8209701686098e-06, + "loss": 0.1341150164604187, + "step": 4120 + }, + { + "epoch": 0.7505458515283843, + "grad_norm": 0.16892646253108978, + "learning_rate": 7.767568074950751e-06, + "loss": 0.1466840147972107, + "step": 4125 + }, + { + "epoch": 0.7514556040756915, + "grad_norm": 0.17288286983966827, + "learning_rate": 7.714315369790942e-06, + "loss": 0.13819680213928223, + "step": 4130 + }, + { + "epoch": 0.7523653566229985, + "grad_norm": 0.21893996000289917, + "learning_rate": 7.661212514779745e-06, + "loss": 0.14369510412216185, + "step": 4135 + }, + { + "epoch": 0.7532751091703057, + "grad_norm": 0.1674601435661316, + "learning_rate": 7.608259970267509e-06, + "loss": 0.14810250997543334, + "step": 4140 + }, + { + "epoch": 0.7541848617176128, + "grad_norm": 0.15875539183616638, + "learning_rate": 7.555458195301526e-06, + "loss": 0.14103198051452637, + "step": 4145 + }, + { + "epoch": 0.75509461426492, + "grad_norm": 0.19454079866409302, + "learning_rate": 7.502807647622037e-06, + "loss": 0.13848764896392823, + "step": 4150 + }, + { + "epoch": 0.756004366812227, + "grad_norm": 0.1795455813407898, + "learning_rate": 7.450308783658341e-06, + "loss": 0.14459335803985596, + "step": 4155 + }, + { + "epoch": 0.7569141193595342, + "grad_norm": 0.1643362045288086, + "learning_rate": 7.397962058524735e-06, + "loss": 0.14335378408432006, + "step": 4160 + }, + { + "epoch": 0.7578238719068413, + "grad_norm": 0.16362066566944122, + "learning_rate": 7.3457679260166475e-06, + "loss": 0.14222005605697632, + "step": 4165 + }, + { + "epoch": 0.7587336244541485, + "grad_norm": 0.17313003540039062, + "learning_rate": 7.293726838606674e-06, + "loss": 0.14272255897521974, + "step": 4170 + }, + { + "epoch": 0.7596433770014556, + "grad_norm": 0.1809929460287094, + "learning_rate": 7.2418392474406405e-06, + "loss": 0.14089123010635377, + "step": 4175 + }, + { + "epoch": 0.7605531295487628, + "grad_norm": 0.14306005835533142, + "learning_rate": 7.19010560233373e-06, + "loss": 0.13531534671783446, + "step": 4180 + }, + { + "epoch": 0.7614628820960698, + "grad_norm": 0.15525390207767487, + "learning_rate": 7.138526351766559e-06, + "loss": 0.14340845346450806, + "step": 4185 + }, + { + "epoch": 0.762372634643377, + "grad_norm": 0.24478943645954132, + "learning_rate": 7.087101942881263e-06, + "loss": 0.14744555950164795, + "step": 4190 + }, + { + "epoch": 0.7632823871906841, + "grad_norm": 0.31335577368736267, + "learning_rate": 7.035832821477711e-06, + "loss": 0.1484094500541687, + "step": 4195 + }, + { + "epoch": 0.7641921397379913, + "grad_norm": 0.15140366554260254, + "learning_rate": 6.984719432009515e-06, + "loss": 0.14991614818572999, + "step": 4200 + }, + { + "epoch": 0.7651018922852983, + "grad_norm": 0.16125506162643433, + "learning_rate": 6.933762217580289e-06, + "loss": 0.1408134937286377, + "step": 4205 + }, + { + "epoch": 0.7660116448326055, + "grad_norm": 0.2501450181007385, + "learning_rate": 6.882961619939726e-06, + "loss": 0.13875640630722047, + "step": 4210 + }, + { + "epoch": 0.7669213973799127, + "grad_norm": 0.16227811574935913, + "learning_rate": 6.8323180794798245e-06, + "loss": 0.14138660430908204, + "step": 4215 + }, + { + "epoch": 0.7678311499272198, + "grad_norm": 0.16676810383796692, + "learning_rate": 6.781832035231053e-06, + "loss": 0.14696706533432008, + "step": 4220 + }, + { + "epoch": 0.768740902474527, + "grad_norm": 0.14638574421405792, + "learning_rate": 6.731503924858518e-06, + "loss": 0.14263020753860473, + "step": 4225 + }, + { + "epoch": 0.769650655021834, + "grad_norm": 0.17093190550804138, + "learning_rate": 6.681334184658211e-06, + "loss": 0.14694111347198485, + "step": 4230 + }, + { + "epoch": 0.7705604075691412, + "grad_norm": 0.17174287140369415, + "learning_rate": 6.631323249553201e-06, + "loss": 0.13854929208755493, + "step": 4235 + }, + { + "epoch": 0.7714701601164483, + "grad_norm": 0.14599016308784485, + "learning_rate": 6.5814715530898745e-06, + "loss": 0.14058833122253417, + "step": 4240 + }, + { + "epoch": 0.7723799126637555, + "grad_norm": 0.16222265362739563, + "learning_rate": 6.531779527434176e-06, + "loss": 0.1428326725959778, + "step": 4245 + }, + { + "epoch": 0.7732896652110626, + "grad_norm": 0.1741994023323059, + "learning_rate": 6.482247603367839e-06, + "loss": 0.13985042572021483, + "step": 4250 + }, + { + "epoch": 0.7741994177583698, + "grad_norm": 0.17427101731300354, + "learning_rate": 6.432876210284688e-06, + "loss": 0.1442667603492737, + "step": 4255 + }, + { + "epoch": 0.7751091703056768, + "grad_norm": 0.1665259599685669, + "learning_rate": 6.383665776186912e-06, + "loss": 0.1421986222267151, + "step": 4260 + }, + { + "epoch": 0.776018922852984, + "grad_norm": 0.1728232353925705, + "learning_rate": 6.334616727681303e-06, + "loss": 0.1367053508758545, + "step": 4265 + }, + { + "epoch": 0.7769286754002911, + "grad_norm": 0.15882381796836853, + "learning_rate": 6.285729489975639e-06, + "loss": 0.14551182985305786, + "step": 4270 + }, + { + "epoch": 0.7778384279475983, + "grad_norm": 0.242042675614357, + "learning_rate": 6.2370044868749115e-06, + "loss": 0.1455132007598877, + "step": 4275 + }, + { + "epoch": 0.7787481804949054, + "grad_norm": 0.1599501073360443, + "learning_rate": 6.188442140777742e-06, + "loss": 0.1424942970275879, + "step": 4280 + }, + { + "epoch": 0.7796579330422125, + "grad_norm": 0.15182635188102722, + "learning_rate": 6.140042872672647e-06, + "loss": 0.14212887287139891, + "step": 4285 + }, + { + "epoch": 0.7805676855895196, + "grad_norm": 0.1720375418663025, + "learning_rate": 6.091807102134403e-06, + "loss": 0.14243412017822266, + "step": 4290 + }, + { + "epoch": 0.7814774381368268, + "grad_norm": 0.16436047852039337, + "learning_rate": 6.043735247320454e-06, + "loss": 0.15035657882690429, + "step": 4295 + }, + { + "epoch": 0.7823871906841339, + "grad_norm": 0.1498408019542694, + "learning_rate": 5.995827724967218e-06, + "loss": 0.14494839906692505, + "step": 4300 + }, + { + "epoch": 0.7832969432314411, + "grad_norm": 0.16924560070037842, + "learning_rate": 5.948084950386535e-06, + "loss": 0.13581212759017944, + "step": 4305 + }, + { + "epoch": 0.7842066957787481, + "grad_norm": 0.15889139473438263, + "learning_rate": 5.900507337462036e-06, + "loss": 0.15071530342102052, + "step": 4310 + }, + { + "epoch": 0.7851164483260553, + "grad_norm": 0.17201054096221924, + "learning_rate": 5.853095298645542e-06, + "loss": 0.1398628830909729, + "step": 4315 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.17965619266033173, + "learning_rate": 5.805849244953548e-06, + "loss": 0.14666696786880493, + "step": 4320 + }, + { + "epoch": 0.7869359534206696, + "grad_norm": 0.17514032125473022, + "learning_rate": 5.758769585963569e-06, + "loss": 0.1383386731147766, + "step": 4325 + }, + { + "epoch": 0.7878457059679768, + "grad_norm": 0.17497631907463074, + "learning_rate": 5.7118567298106744e-06, + "loss": 0.14362354278564454, + "step": 4330 + }, + { + "epoch": 0.7887554585152838, + "grad_norm": 0.16770458221435547, + "learning_rate": 5.665111083183905e-06, + "loss": 0.14136618375778198, + "step": 4335 + }, + { + "epoch": 0.789665211062591, + "grad_norm": 0.17134106159210205, + "learning_rate": 5.618533051322747e-06, + "loss": 0.1401529550552368, + "step": 4340 + }, + { + "epoch": 0.7905749636098981, + "grad_norm": 0.19458788633346558, + "learning_rate": 5.5721230380136435e-06, + "loss": 0.1393273115158081, + "step": 4345 + }, + { + "epoch": 0.7914847161572053, + "grad_norm": 0.19483692944049835, + "learning_rate": 5.525881445586467e-06, + "loss": 0.1369825482368469, + "step": 4350 + }, + { + "epoch": 0.7923944687045124, + "grad_norm": 0.3052191734313965, + "learning_rate": 5.4798086749110495e-06, + "loss": 0.14762181043624878, + "step": 4355 + }, + { + "epoch": 0.7933042212518195, + "grad_norm": 0.164458766579628, + "learning_rate": 5.4339051253937065e-06, + "loss": 0.14501686096191407, + "step": 4360 + }, + { + "epoch": 0.7942139737991266, + "grad_norm": 0.1719193458557129, + "learning_rate": 5.3881711949737625e-06, + "loss": 0.13321092128753662, + "step": 4365 + }, + { + "epoch": 0.7951237263464338, + "grad_norm": 0.17219696938991547, + "learning_rate": 5.342607280120121e-06, + "loss": 0.1413906455039978, + "step": 4370 + }, + { + "epoch": 0.7960334788937409, + "grad_norm": 0.15083056688308716, + "learning_rate": 5.297213775827789e-06, + "loss": 0.14772192239761353, + "step": 4375 + }, + { + "epoch": 0.7969432314410481, + "grad_norm": 0.1699071079492569, + "learning_rate": 5.251991075614507e-06, + "loss": 0.1392375946044922, + "step": 4380 + }, + { + "epoch": 0.7978529839883551, + "grad_norm": 0.1680395007133484, + "learning_rate": 5.206939571517302e-06, + "loss": 0.14185575246810914, + "step": 4385 + }, + { + "epoch": 0.7987627365356623, + "grad_norm": 0.16526710987091064, + "learning_rate": 5.162059654089083e-06, + "loss": 0.15001428127288818, + "step": 4390 + }, + { + "epoch": 0.7996724890829694, + "grad_norm": 0.16281752288341522, + "learning_rate": 5.1173517123952794e-06, + "loss": 0.13747023344039916, + "step": 4395 + }, + { + "epoch": 0.8005822416302766, + "grad_norm": 0.1454378366470337, + "learning_rate": 5.072816134010458e-06, + "loss": 0.14710829257965088, + "step": 4400 + }, + { + "epoch": 0.8014919941775837, + "grad_norm": 0.16565890610218048, + "learning_rate": 5.028453305014966e-06, + "loss": 0.14138611555099487, + "step": 4405 + }, + { + "epoch": 0.8024017467248908, + "grad_norm": 0.1962810605764389, + "learning_rate": 4.984263609991577e-06, + "loss": 0.13836177587509155, + "step": 4410 + }, + { + "epoch": 0.8033114992721979, + "grad_norm": 0.16091369092464447, + "learning_rate": 4.940247432022149e-06, + "loss": 0.14407440423965454, + "step": 4415 + }, + { + "epoch": 0.8042212518195051, + "grad_norm": 0.1930241584777832, + "learning_rate": 4.89640515268433e-06, + "loss": 0.14346336126327514, + "step": 4420 + }, + { + "epoch": 0.8051310043668122, + "grad_norm": 0.19301500916481018, + "learning_rate": 4.852737152048242e-06, + "loss": 0.14174317121505736, + "step": 4425 + }, + { + "epoch": 0.8060407569141194, + "grad_norm": 0.1541353315114975, + "learning_rate": 4.80924380867315e-06, + "loss": 0.14100592136383056, + "step": 4430 + }, + { + "epoch": 0.8069505094614265, + "grad_norm": 0.16285750269889832, + "learning_rate": 4.765925499604243e-06, + "loss": 0.1441288709640503, + "step": 4435 + }, + { + "epoch": 0.8078602620087336, + "grad_norm": 0.17382675409317017, + "learning_rate": 4.722782600369299e-06, + "loss": 0.13763951063156127, + "step": 4440 + }, + { + "epoch": 0.8087700145560408, + "grad_norm": 0.1697344034910202, + "learning_rate": 4.679815484975505e-06, + "loss": 0.1410105347633362, + "step": 4445 + }, + { + "epoch": 0.8096797671033479, + "grad_norm": 0.19964542984962463, + "learning_rate": 4.637024525906131e-06, + "loss": 0.1439276695251465, + "step": 4450 + }, + { + "epoch": 0.8105895196506551, + "grad_norm": 0.165307879447937, + "learning_rate": 4.59441009411736e-06, + "loss": 0.13897504806518554, + "step": 4455 + }, + { + "epoch": 0.8114992721979621, + "grad_norm": 0.16687989234924316, + "learning_rate": 4.551972559035067e-06, + "loss": 0.1422593355178833, + "step": 4460 + }, + { + "epoch": 0.8124090247452693, + "grad_norm": 0.15737789869308472, + "learning_rate": 4.509712288551571e-06, + "loss": 0.1452128052711487, + "step": 4465 + }, + { + "epoch": 0.8133187772925764, + "grad_norm": 0.17116659879684448, + "learning_rate": 4.467629649022509e-06, + "loss": 0.14385371208190917, + "step": 4470 + }, + { + "epoch": 0.8142285298398836, + "grad_norm": 0.17457640171051025, + "learning_rate": 4.425725005263623e-06, + "loss": 0.14808475971221924, + "step": 4475 + }, + { + "epoch": 0.8151382823871907, + "grad_norm": 0.1621970385313034, + "learning_rate": 4.383998720547583e-06, + "loss": 0.13927959203720092, + "step": 4480 + }, + { + "epoch": 0.8160480349344978, + "grad_norm": 0.176296666264534, + "learning_rate": 4.342451156600896e-06, + "loss": 0.15041060447692872, + "step": 4485 + }, + { + "epoch": 0.8169577874818049, + "grad_norm": 0.17157645523548126, + "learning_rate": 4.301082673600698e-06, + "loss": 0.13932652473449708, + "step": 4490 + }, + { + "epoch": 0.8178675400291121, + "grad_norm": 0.15378527343273163, + "learning_rate": 4.259893630171682e-06, + "loss": 0.1406856894493103, + "step": 4495 + }, + { + "epoch": 0.8187772925764192, + "grad_norm": 0.1750226765871048, + "learning_rate": 4.218884383382987e-06, + "loss": 0.1350164532661438, + "step": 4500 + }, + { + "epoch": 0.8196870451237264, + "grad_norm": 0.1393742561340332, + "learning_rate": 4.178055288745053e-06, + "loss": 0.13769235610961914, + "step": 4505 + }, + { + "epoch": 0.8205967976710334, + "grad_norm": 0.1668994128704071, + "learning_rate": 4.137406700206617e-06, + "loss": 0.14029752016067504, + "step": 4510 + }, + { + "epoch": 0.8215065502183406, + "grad_norm": 0.1833454668521881, + "learning_rate": 4.0969389701515675e-06, + "loss": 0.14276301860809326, + "step": 4515 + }, + { + "epoch": 0.8224163027656477, + "grad_norm": 0.16187874972820282, + "learning_rate": 4.056652449395945e-06, + "loss": 0.1444832682609558, + "step": 4520 + }, + { + "epoch": 0.8233260553129549, + "grad_norm": 0.1453280746936798, + "learning_rate": 4.01654748718488e-06, + "loss": 0.14512733221054078, + "step": 4525 + }, + { + "epoch": 0.824235807860262, + "grad_norm": 0.1782725751399994, + "learning_rate": 3.976624431189563e-06, + "loss": 0.14093561172485353, + "step": 4530 + }, + { + "epoch": 0.8251455604075691, + "grad_norm": 0.17374491691589355, + "learning_rate": 3.936883627504234e-06, + "loss": 0.14031401872634888, + "step": 4535 + }, + { + "epoch": 0.8260553129548762, + "grad_norm": 0.1609172821044922, + "learning_rate": 3.897325420643174e-06, + "loss": 0.1428336262702942, + "step": 4540 + }, + { + "epoch": 0.8269650655021834, + "grad_norm": 0.1520884931087494, + "learning_rate": 3.85795015353774e-06, + "loss": 0.1460547924041748, + "step": 4545 + }, + { + "epoch": 0.8278748180494906, + "grad_norm": 0.20986326038837433, + "learning_rate": 3.818758167533376e-06, + "loss": 0.14706350564956666, + "step": 4550 + }, + { + "epoch": 0.8287845705967977, + "grad_norm": 0.16825413703918457, + "learning_rate": 3.7797498023866396e-06, + "loss": 0.14507200717926025, + "step": 4555 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.16758380830287933, + "learning_rate": 3.740925396262296e-06, + "loss": 0.14898381233215333, + "step": 4560 + }, + { + "epoch": 0.8306040756914119, + "grad_norm": 0.15207453072071075, + "learning_rate": 3.7022852857303503e-06, + "loss": 0.14138854742050172, + "step": 4565 + }, + { + "epoch": 0.8315138282387191, + "grad_norm": 0.15150749683380127, + "learning_rate": 3.66382980576315e-06, + "loss": 0.13894975185394287, + "step": 4570 + }, + { + "epoch": 0.8324235807860262, + "grad_norm": 0.17071188986301422, + "learning_rate": 3.625559289732472e-06, + "loss": 0.14072470664978026, + "step": 4575 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.154335618019104, + "learning_rate": 3.5874740694066294e-06, + "loss": 0.13791344165802003, + "step": 4580 + }, + { + "epoch": 0.8342430858806404, + "grad_norm": 0.14017128944396973, + "learning_rate": 3.5495744749476116e-06, + "loss": 0.14427922964096068, + "step": 4585 + }, + { + "epoch": 0.8351528384279476, + "grad_norm": 0.17210033535957336, + "learning_rate": 3.5118608349081983e-06, + "loss": 0.15191166400909423, + "step": 4590 + }, + { + "epoch": 0.8360625909752547, + "grad_norm": 0.18715685606002808, + "learning_rate": 3.4743334762291358e-06, + "loss": 0.14451316595077515, + "step": 4595 + }, + { + "epoch": 0.8369723435225619, + "grad_norm": 0.18079884350299835, + "learning_rate": 3.436992724236293e-06, + "loss": 0.13530746698379517, + "step": 4600 + }, + { + "epoch": 0.837882096069869, + "grad_norm": 0.13519920408725739, + "learning_rate": 3.399838902637817e-06, + "loss": 0.1477964401245117, + "step": 4605 + }, + { + "epoch": 0.8387918486171762, + "grad_norm": 0.1778026670217514, + "learning_rate": 3.3628723335213885e-06, + "loss": 0.14419831037521363, + "step": 4610 + }, + { + "epoch": 0.8397016011644832, + "grad_norm": 0.15165366232395172, + "learning_rate": 3.326093337351355e-06, + "loss": 0.13888469934463502, + "step": 4615 + }, + { + "epoch": 0.8406113537117904, + "grad_norm": 0.17049473524093628, + "learning_rate": 3.2895022329660018e-06, + "loss": 0.14438477754592896, + "step": 4620 + }, + { + "epoch": 0.8415211062590975, + "grad_norm": 0.16536414623260498, + "learning_rate": 3.2530993375747833e-06, + "loss": 0.1444351315498352, + "step": 4625 + }, + { + "epoch": 0.8424308588064047, + "grad_norm": 0.17570015788078308, + "learning_rate": 3.2168849667555402e-06, + "loss": 0.13861945867538453, + "step": 4630 + }, + { + "epoch": 0.8433406113537117, + "grad_norm": 0.1699545532464981, + "learning_rate": 3.1808594344518132e-06, + "loss": 0.13902754783630372, + "step": 4635 + }, + { + "epoch": 0.8442503639010189, + "grad_norm": 0.12331254780292511, + "learning_rate": 3.1450230529700837e-06, + "loss": 0.14104254245758058, + "step": 4640 + }, + { + "epoch": 0.845160116448326, + "grad_norm": 0.1508190929889679, + "learning_rate": 3.1093761329770708e-06, + "loss": 0.13288766145706177, + "step": 4645 + }, + { + "epoch": 0.8460698689956332, + "grad_norm": 0.19049489498138428, + "learning_rate": 3.0739189834970735e-06, + "loss": 0.14914840459823608, + "step": 4650 + }, + { + "epoch": 0.8469796215429404, + "grad_norm": 0.1662369966506958, + "learning_rate": 3.0386519119092293e-06, + "loss": 0.14222898483276367, + "step": 4655 + }, + { + "epoch": 0.8478893740902474, + "grad_norm": 0.18985967338085175, + "learning_rate": 3.0035752239449126e-06, + "loss": 0.14431113004684448, + "step": 4660 + }, + { + "epoch": 0.8487991266375546, + "grad_norm": 0.17005261778831482, + "learning_rate": 2.9686892236850337e-06, + "loss": 0.14140807390213012, + "step": 4665 + }, + { + "epoch": 0.8497088791848617, + "grad_norm": 0.16786684095859528, + "learning_rate": 2.9339942135574394e-06, + "loss": 0.14161460399627684, + "step": 4670 + }, + { + "epoch": 0.8506186317321689, + "grad_norm": 0.16358181834220886, + "learning_rate": 2.899490494334281e-06, + "loss": 0.14674670696258546, + "step": 4675 + }, + { + "epoch": 0.851528384279476, + "grad_norm": 0.1651349812746048, + "learning_rate": 2.8651783651293867e-06, + "loss": 0.13794611692428588, + "step": 4680 + }, + { + "epoch": 0.8524381368267832, + "grad_norm": 0.16934923827648163, + "learning_rate": 2.831058123395694e-06, + "loss": 0.13199397325515747, + "step": 4685 + }, + { + "epoch": 0.8533478893740902, + "grad_norm": 0.1704150140285492, + "learning_rate": 2.797130064922665e-06, + "loss": 0.14044904708862305, + "step": 4690 + }, + { + "epoch": 0.8542576419213974, + "grad_norm": 0.1814192682504654, + "learning_rate": 2.7633944838337143e-06, + "loss": 0.1465100646018982, + "step": 4695 + }, + { + "epoch": 0.8551673944687045, + "grad_norm": 0.18942610919475555, + "learning_rate": 2.729851672583669e-06, + "loss": 0.14685982465744019, + "step": 4700 + }, + { + "epoch": 0.8560771470160117, + "grad_norm": 0.17895208299160004, + "learning_rate": 2.6965019219562155e-06, + "loss": 0.13971571922302245, + "step": 4705 + }, + { + "epoch": 0.8569868995633187, + "grad_norm": 0.22735828161239624, + "learning_rate": 2.6633455210614055e-06, + "loss": 0.13776102066040039, + "step": 4710 + }, + { + "epoch": 0.8578966521106259, + "grad_norm": 0.16779793798923492, + "learning_rate": 2.630382757333133e-06, + "loss": 0.14134042263031005, + "step": 4715 + }, + { + "epoch": 0.858806404657933, + "grad_norm": 0.2148888260126114, + "learning_rate": 2.597613916526637e-06, + "loss": 0.14680721759796142, + "step": 4720 + }, + { + "epoch": 0.8597161572052402, + "grad_norm": 0.16560257971286774, + "learning_rate": 2.565039282716045e-06, + "loss": 0.14137234687805175, + "step": 4725 + }, + { + "epoch": 0.8606259097525473, + "grad_norm": 0.16197068989276886, + "learning_rate": 2.532659138291879e-06, + "loss": 0.14969314336776735, + "step": 4730 + }, + { + "epoch": 0.8615356622998545, + "grad_norm": 0.14650246500968933, + "learning_rate": 2.5004737639586497e-06, + "loss": 0.13532910346984864, + "step": 4735 + }, + { + "epoch": 0.8624454148471615, + "grad_norm": 0.1565634310245514, + "learning_rate": 2.4684834387323943e-06, + "loss": 0.14146244525909424, + "step": 4740 + }, + { + "epoch": 0.8633551673944687, + "grad_norm": 0.18060864508152008, + "learning_rate": 2.4366884399382393e-06, + "loss": 0.14218534231185914, + "step": 4745 + }, + { + "epoch": 0.8642649199417758, + "grad_norm": 0.24613255262374878, + "learning_rate": 2.4050890432080557e-06, + "loss": 0.13907679319381713, + "step": 4750 + }, + { + "epoch": 0.865174672489083, + "grad_norm": 0.16036023199558258, + "learning_rate": 2.3736855224780057e-06, + "loss": 0.13718113899230958, + "step": 4755 + }, + { + "epoch": 0.86608442503639, + "grad_norm": 0.16678516566753387, + "learning_rate": 2.3424781499862075e-06, + "loss": 0.1327962040901184, + "step": 4760 + }, + { + "epoch": 0.8669941775836972, + "grad_norm": 0.1763770878314972, + "learning_rate": 2.3114671962703727e-06, + "loss": 0.14390318393707274, + "step": 4765 + }, + { + "epoch": 0.8679039301310044, + "grad_norm": 0.17735697329044342, + "learning_rate": 2.280652930165428e-06, + "loss": 0.15223288536071777, + "step": 4770 + }, + { + "epoch": 0.8688136826783115, + "grad_norm": 0.15827041864395142, + "learning_rate": 2.250035618801241e-06, + "loss": 0.14296332597732545, + "step": 4775 + }, + { + "epoch": 0.8697234352256187, + "grad_norm": 0.16876135766506195, + "learning_rate": 2.219615527600244e-06, + "loss": 0.1359076738357544, + "step": 4780 + }, + { + "epoch": 0.8706331877729258, + "grad_norm": 0.1800110638141632, + "learning_rate": 2.189392920275174e-06, + "loss": 0.1424281358718872, + "step": 4785 + }, + { + "epoch": 0.8715429403202329, + "grad_norm": 0.1409560889005661, + "learning_rate": 2.159368058826783e-06, + "loss": 0.14480490684509278, + "step": 4790 + }, + { + "epoch": 0.87245269286754, + "grad_norm": 0.1634288728237152, + "learning_rate": 2.129541203541535e-06, + "loss": 0.14513269662857056, + "step": 4795 + }, + { + "epoch": 0.8733624454148472, + "grad_norm": 0.17126062512397766, + "learning_rate": 2.099912612989391e-06, + "loss": 0.13546934127807617, + "step": 4800 + }, + { + "epoch": 0.8742721979621543, + "grad_norm": 0.16704080998897552, + "learning_rate": 2.0704825440215457e-06, + "loss": 0.13852492570877076, + "step": 4805 + }, + { + "epoch": 0.8751819505094615, + "grad_norm": 0.1725970208644867, + "learning_rate": 2.0412512517681946e-06, + "loss": 0.14504197835922242, + "step": 4810 + }, + { + "epoch": 0.8760917030567685, + "grad_norm": 0.1700201779603958, + "learning_rate": 2.0122189896363387e-06, + "loss": 0.14312338829040527, + "step": 4815 + }, + { + "epoch": 0.8770014556040757, + "grad_norm": 0.16491736471652985, + "learning_rate": 1.9833860093075834e-06, + "loss": 0.14062976837158203, + "step": 4820 + }, + { + "epoch": 0.8779112081513828, + "grad_norm": 0.13748787343502045, + "learning_rate": 1.9547525607359537e-06, + "loss": 0.1346171498298645, + "step": 4825 + }, + { + "epoch": 0.87882096069869, + "grad_norm": 0.16399399936199188, + "learning_rate": 1.926318892145712e-06, + "loss": 0.14178123474121093, + "step": 4830 + }, + { + "epoch": 0.879730713245997, + "grad_norm": 0.14491963386535645, + "learning_rate": 1.8980852500292412e-06, + "loss": 0.1408564567565918, + "step": 4835 + }, + { + "epoch": 0.8806404657933042, + "grad_norm": 0.17335423827171326, + "learning_rate": 1.8700518791448851e-06, + "loss": 0.14403265714645386, + "step": 4840 + }, + { + "epoch": 0.8815502183406113, + "grad_norm": 0.17399625480175018, + "learning_rate": 1.8422190225148155e-06, + "loss": 0.14289036989212037, + "step": 4845 + }, + { + "epoch": 0.8824599708879185, + "grad_norm": 0.17945612967014313, + "learning_rate": 1.814586921422956e-06, + "loss": 0.14494109153747559, + "step": 4850 + }, + { + "epoch": 0.8833697234352256, + "grad_norm": 0.1910620480775833, + "learning_rate": 1.7871558154128664e-06, + "loss": 0.13726245164871215, + "step": 4855 + }, + { + "epoch": 0.8842794759825328, + "grad_norm": 0.1771879345178604, + "learning_rate": 1.7599259422856756e-06, + "loss": 0.1464752197265625, + "step": 4860 + }, + { + "epoch": 0.8851892285298398, + "grad_norm": 0.19427461922168732, + "learning_rate": 1.7328975380980218e-06, + "loss": 0.13823356628417968, + "step": 4865 + }, + { + "epoch": 0.886098981077147, + "grad_norm": 0.1491149365901947, + "learning_rate": 1.7060708371599897e-06, + "loss": 0.1338604211807251, + "step": 4870 + }, + { + "epoch": 0.8870087336244541, + "grad_norm": 0.16087733209133148, + "learning_rate": 1.6794460720331057e-06, + "loss": 0.14184389114379883, + "step": 4875 + }, + { + "epoch": 0.8879184861717613, + "grad_norm": 0.14506325125694275, + "learning_rate": 1.653023473528309e-06, + "loss": 0.14267687797546386, + "step": 4880 + }, + { + "epoch": 0.8888282387190685, + "grad_norm": 0.16886365413665771, + "learning_rate": 1.626803270703936e-06, + "loss": 0.14266083240509034, + "step": 4885 + }, + { + "epoch": 0.8897379912663755, + "grad_norm": 0.1891999989748001, + "learning_rate": 1.6007856908637652e-06, + "loss": 0.1398016929626465, + "step": 4890 + }, + { + "epoch": 0.8906477438136827, + "grad_norm": 0.17645299434661865, + "learning_rate": 1.5749709595550083e-06, + "loss": 0.13869571685791016, + "step": 4895 + }, + { + "epoch": 0.8915574963609898, + "grad_norm": 0.17714262008666992, + "learning_rate": 1.549359300566408e-06, + "loss": 0.14957486391067504, + "step": 4900 + }, + { + "epoch": 0.892467248908297, + "grad_norm": 0.18025240302085876, + "learning_rate": 1.5239509359262355e-06, + "loss": 0.1358652949333191, + "step": 4905 + }, + { + "epoch": 0.8933770014556041, + "grad_norm": 0.17539937794208527, + "learning_rate": 1.4987460859004154e-06, + "loss": 0.13833394050598144, + "step": 4910 + }, + { + "epoch": 0.8942867540029112, + "grad_norm": 0.1772230565547943, + "learning_rate": 1.4737449689905953e-06, + "loss": 0.14202116727828978, + "step": 4915 + }, + { + "epoch": 0.8951965065502183, + "grad_norm": 0.1670161783695221, + "learning_rate": 1.4489478019322433e-06, + "loss": 0.1403665542602539, + "step": 4920 + }, + { + "epoch": 0.8961062590975255, + "grad_norm": 0.1697034239768982, + "learning_rate": 1.4243547996927926e-06, + "loss": 0.1401481032371521, + "step": 4925 + }, + { + "epoch": 0.8970160116448326, + "grad_norm": 0.16474860906600952, + "learning_rate": 1.3999661754697636e-06, + "loss": 0.13969850540161133, + "step": 4930 + }, + { + "epoch": 0.8979257641921398, + "grad_norm": 0.1664883941411972, + "learning_rate": 1.3757821406889027e-06, + "loss": 0.1399069309234619, + "step": 4935 + }, + { + "epoch": 0.8988355167394468, + "grad_norm": 0.16675794124603271, + "learning_rate": 1.351802905002386e-06, + "loss": 0.14129226207733153, + "step": 4940 + }, + { + "epoch": 0.899745269286754, + "grad_norm": 0.17529809474945068, + "learning_rate": 1.3280286762869632e-06, + "loss": 0.14663081169128417, + "step": 4945 + }, + { + "epoch": 0.9006550218340611, + "grad_norm": 0.17758169770240784, + "learning_rate": 1.3044596606421795e-06, + "loss": 0.13986254930496217, + "step": 4950 + }, + { + "epoch": 0.9015647743813683, + "grad_norm": 0.153225839138031, + "learning_rate": 1.2810960623885815e-06, + "loss": 0.14236698150634766, + "step": 4955 + }, + { + "epoch": 0.9024745269286754, + "grad_norm": 0.169761523604393, + "learning_rate": 1.2579380840659376e-06, + "loss": 0.1450445055961609, + "step": 4960 + }, + { + "epoch": 0.9033842794759825, + "grad_norm": 0.16659331321716309, + "learning_rate": 1.2349859264315034e-06, + "loss": 0.14043926000595092, + "step": 4965 + }, + { + "epoch": 0.9042940320232896, + "grad_norm": 0.16748706996440887, + "learning_rate": 1.2122397884582553e-06, + "loss": 0.14725675582885742, + "step": 4970 + }, + { + "epoch": 0.9052037845705968, + "grad_norm": 0.1600511223077774, + "learning_rate": 1.1896998673331883e-06, + "loss": 0.14551150798797607, + "step": 4975 + }, + { + "epoch": 0.9061135371179039, + "grad_norm": 0.24318362772464752, + "learning_rate": 1.1673663584555934e-06, + "loss": 0.14470888376235963, + "step": 4980 + }, + { + "epoch": 0.9070232896652111, + "grad_norm": 0.16443821787834167, + "learning_rate": 1.1452394554353706e-06, + "loss": 0.13639854192733764, + "step": 4985 + }, + { + "epoch": 0.9079330422125182, + "grad_norm": 0.14277774095535278, + "learning_rate": 1.1233193500913453e-06, + "loss": 0.13749881982803344, + "step": 4990 + }, + { + "epoch": 0.9088427947598253, + "grad_norm": 0.1610947549343109, + "learning_rate": 1.1016062324496008e-06, + "loss": 0.1385629653930664, + "step": 4995 + }, + { + "epoch": 0.9097525473071325, + "grad_norm": 0.17888498306274414, + "learning_rate": 1.080100290741845e-06, + "loss": 0.14225621223449708, + "step": 5000 + }, + { + "epoch": 0.9106622998544396, + "grad_norm": 0.17488449811935425, + "learning_rate": 1.0588017114037729e-06, + "loss": 0.14187805652618407, + "step": 5005 + }, + { + "epoch": 0.9115720524017468, + "grad_norm": 0.16410665214061737, + "learning_rate": 1.0377106790734392e-06, + "loss": 0.1407416582107544, + "step": 5010 + }, + { + "epoch": 0.9124818049490538, + "grad_norm": 0.18115971982479095, + "learning_rate": 1.016827376589674e-06, + "loss": 0.1427263855934143, + "step": 5015 + }, + { + "epoch": 0.913391557496361, + "grad_norm": 0.18507841229438782, + "learning_rate": 9.961519849904898e-07, + "loss": 0.1390499472618103, + "step": 5020 + }, + { + "epoch": 0.9143013100436681, + "grad_norm": 0.21296796202659607, + "learning_rate": 9.75684683511513e-07, + "loss": 0.1382216691970825, + "step": 5025 + }, + { + "epoch": 0.9152110625909753, + "grad_norm": 0.2308044582605362, + "learning_rate": 9.55425649584435e-07, + "loss": 0.14271280765533448, + "step": 5030 + }, + { + "epoch": 0.9161208151382824, + "grad_norm": 0.15796682238578796, + "learning_rate": 9.353750588354527e-07, + "loss": 0.13807624578475952, + "step": 5035 + }, + { + "epoch": 0.9170305676855895, + "grad_norm": 0.1695316582918167, + "learning_rate": 9.155330850837834e-07, + "loss": 0.14289476871490478, + "step": 5040 + }, + { + "epoch": 0.9179403202328966, + "grad_norm": 0.1738404780626297, + "learning_rate": 8.958999003401191e-07, + "loss": 0.14070619344711305, + "step": 5045 + }, + { + "epoch": 0.9188500727802038, + "grad_norm": 0.20618964731693268, + "learning_rate": 8.764756748051662e-07, + "loss": 0.14535053968429565, + "step": 5050 + }, + { + "epoch": 0.9197598253275109, + "grad_norm": 0.1506137251853943, + "learning_rate": 8.572605768681546e-07, + "loss": 0.13995139598846434, + "step": 5055 + }, + { + "epoch": 0.9206695778748181, + "grad_norm": 0.17772039771080017, + "learning_rate": 8.382547731053708e-07, + "loss": 0.14470311403274536, + "step": 5060 + }, + { + "epoch": 0.9215793304221251, + "grad_norm": 0.19897456467151642, + "learning_rate": 8.194584282787382e-07, + "loss": 0.144488525390625, + "step": 5065 + }, + { + "epoch": 0.9224890829694323, + "grad_norm": 0.15899236500263214, + "learning_rate": 8.008717053343606e-07, + "loss": 0.1352991580963135, + "step": 5070 + }, + { + "epoch": 0.9233988355167394, + "grad_norm": 0.14965768158435822, + "learning_rate": 7.824947654011345e-07, + "loss": 0.13827911615371705, + "step": 5075 + }, + { + "epoch": 0.9243085880640466, + "grad_norm": 0.43651485443115234, + "learning_rate": 7.643277677893329e-07, + "loss": 0.14149526357650757, + "step": 5080 + }, + { + "epoch": 0.9252183406113537, + "grad_norm": 0.19912713766098022, + "learning_rate": 7.463708699892325e-07, + "loss": 0.14357032775878906, + "step": 5085 + }, + { + "epoch": 0.9261280931586608, + "grad_norm": 0.1635904610157013, + "learning_rate": 7.286242276697524e-07, + "loss": 0.13550699949264527, + "step": 5090 + }, + { + "epoch": 0.9270378457059679, + "grad_norm": 0.19391080737113953, + "learning_rate": 7.11087994677101e-07, + "loss": 0.14674756526947022, + "step": 5095 + }, + { + "epoch": 0.9279475982532751, + "grad_norm": 0.17458125948905945, + "learning_rate": 6.937623230334284e-07, + "loss": 0.14155579805374147, + "step": 5100 + }, + { + "epoch": 0.9288573508005823, + "grad_norm": 0.1617971807718277, + "learning_rate": 6.766473629355452e-07, + "loss": 0.140555477142334, + "step": 5105 + }, + { + "epoch": 0.9297671033478894, + "grad_norm": 0.16945427656173706, + "learning_rate": 6.59743262753576e-07, + "loss": 0.13607511520385743, + "step": 5110 + }, + { + "epoch": 0.9306768558951966, + "grad_norm": 0.18347840011119843, + "learning_rate": 6.43050169029702e-07, + "loss": 0.14903461933135986, + "step": 5115 + }, + { + "epoch": 0.9315866084425036, + "grad_norm": 0.15434837341308594, + "learning_rate": 6.265682264768869e-07, + "loss": 0.14146015644073487, + "step": 5120 + }, + { + "epoch": 0.9324963609898108, + "grad_norm": 0.1397712528705597, + "learning_rate": 6.10297577977606e-07, + "loss": 0.14261592626571656, + "step": 5125 + }, + { + "epoch": 0.9334061135371179, + "grad_norm": 0.1765873283147812, + "learning_rate": 5.942383645826361e-07, + "loss": 0.13559447526931762, + "step": 5130 + }, + { + "epoch": 0.9343158660844251, + "grad_norm": 0.1656057983636856, + "learning_rate": 5.783907255098003e-07, + "loss": 0.13961490392684936, + "step": 5135 + }, + { + "epoch": 0.9352256186317321, + "grad_norm": 0.2169366180896759, + "learning_rate": 5.627547981427894e-07, + "loss": 0.1447835922241211, + "step": 5140 + }, + { + "epoch": 0.9361353711790393, + "grad_norm": 0.18623125553131104, + "learning_rate": 5.473307180299508e-07, + "loss": 0.14366730451583862, + "step": 5145 + }, + { + "epoch": 0.9370451237263464, + "grad_norm": 0.15423963963985443, + "learning_rate": 5.32118618883129e-07, + "loss": 0.14295632839202882, + "step": 5150 + }, + { + "epoch": 0.9379548762736536, + "grad_norm": 0.18423247337341309, + "learning_rate": 5.17118632576491e-07, + "loss": 0.14137414693832398, + "step": 5155 + }, + { + "epoch": 0.9388646288209607, + "grad_norm": 0.15338757634162903, + "learning_rate": 5.023308891453915e-07, + "loss": 0.13583066463470458, + "step": 5160 + }, + { + "epoch": 0.9397743813682679, + "grad_norm": 0.2293633222579956, + "learning_rate": 4.877555167852515e-07, + "loss": 0.14819620847702025, + "step": 5165 + }, + { + "epoch": 0.9406841339155749, + "grad_norm": 0.16889944672584534, + "learning_rate": 4.7339264185043974e-07, + "loss": 0.13617686033248902, + "step": 5170 + }, + { + "epoch": 0.9415938864628821, + "grad_norm": 0.1767464578151703, + "learning_rate": 4.5924238885316775e-07, + "loss": 0.13487552404403685, + "step": 5175 + }, + { + "epoch": 0.9425036390101892, + "grad_norm": 0.16697899997234344, + "learning_rate": 4.453048804624327e-07, + "loss": 0.1446886420249939, + "step": 5180 + }, + { + "epoch": 0.9434133915574964, + "grad_norm": 0.19576266407966614, + "learning_rate": 4.315802375029293e-07, + "loss": 0.14252450466156005, + "step": 5185 + }, + { + "epoch": 0.9443231441048034, + "grad_norm": 0.14838077127933502, + "learning_rate": 4.18068578954034e-07, + "loss": 0.13933032751083374, + "step": 5190 + }, + { + "epoch": 0.9452328966521106, + "grad_norm": 0.18481744825839996, + "learning_rate": 4.047700219487388e-07, + "loss": 0.1410665273666382, + "step": 5195 + }, + { + "epoch": 0.9461426491994177, + "grad_norm": 0.16954176127910614, + "learning_rate": 3.9168468177265547e-07, + "loss": 0.1421758770942688, + "step": 5200 + }, + { + "epoch": 0.9470524017467249, + "grad_norm": 0.17614421248435974, + "learning_rate": 3.7881267186301306e-07, + "loss": 0.14059911966323851, + "step": 5205 + }, + { + "epoch": 0.9479621542940321, + "grad_norm": 0.1637226939201355, + "learning_rate": 3.6615410380767544e-07, + "loss": 0.1360395908355713, + "step": 5210 + }, + { + "epoch": 0.9488719068413392, + "grad_norm": 0.18330250680446625, + "learning_rate": 3.5370908734417006e-07, + "loss": 0.14543824195861815, + "step": 5215 + }, + { + "epoch": 0.9497816593886463, + "grad_norm": 0.1895420402288437, + "learning_rate": 3.414777303587413e-07, + "loss": 0.15304578542709352, + "step": 5220 + }, + { + "epoch": 0.9506914119359534, + "grad_norm": 0.15384933352470398, + "learning_rate": 3.294601388854041e-07, + "loss": 0.14675912857055665, + "step": 5225 + }, + { + "epoch": 0.9516011644832606, + "grad_norm": 0.20188499987125397, + "learning_rate": 3.1765641710505e-07, + "loss": 0.14068362712860108, + "step": 5230 + }, + { + "epoch": 0.9525109170305677, + "grad_norm": 0.16467279195785522, + "learning_rate": 3.060666673445123e-07, + "loss": 0.14733167886734008, + "step": 5235 + }, + { + "epoch": 0.9534206695778749, + "grad_norm": 0.16632016003131866, + "learning_rate": 2.9469099007569943e-07, + "loss": 0.13753929138183593, + "step": 5240 + }, + { + "epoch": 0.9543304221251819, + "grad_norm": 0.1477566957473755, + "learning_rate": 2.83529483914724e-07, + "loss": 0.14354891777038575, + "step": 5245 + }, + { + "epoch": 0.9552401746724891, + "grad_norm": 0.1693645417690277, + "learning_rate": 2.7258224562102805e-07, + "loss": 0.14622807502746582, + "step": 5250 + }, + { + "epoch": 0.9561499272197962, + "grad_norm": 0.17574062943458557, + "learning_rate": 2.6184937009657295e-07, + "loss": 0.1344899296760559, + "step": 5255 + }, + { + "epoch": 0.9570596797671034, + "grad_norm": 0.17448563873767853, + "learning_rate": 2.513309503850009e-07, + "loss": 0.1355789542198181, + "step": 5260 + }, + { + "epoch": 0.9579694323144105, + "grad_norm": 0.16993778944015503, + "learning_rate": 2.41027077670819e-07, + "loss": 0.151595401763916, + "step": 5265 + }, + { + "epoch": 0.9588791848617176, + "grad_norm": 0.16944102942943573, + "learning_rate": 2.3093784127863062e-07, + "loss": 0.1466623306274414, + "step": 5270 + }, + { + "epoch": 0.9597889374090247, + "grad_norm": 0.18085163831710815, + "learning_rate": 2.2106332867234402e-07, + "loss": 0.14645814895629883, + "step": 5275 + }, + { + "epoch": 0.9606986899563319, + "grad_norm": 0.14682307839393616, + "learning_rate": 2.1140362545442605e-07, + "loss": 0.13901774883270263, + "step": 5280 + }, + { + "epoch": 0.961608442503639, + "grad_norm": 0.17189526557922363, + "learning_rate": 2.0195881536514694e-07, + "loss": 0.14153491258621215, + "step": 5285 + }, + { + "epoch": 0.9625181950509462, + "grad_norm": 0.1977207362651825, + "learning_rate": 1.9272898028186714e-07, + "loss": 0.1437437653541565, + "step": 5290 + }, + { + "epoch": 0.9634279475982532, + "grad_norm": 0.16637668013572693, + "learning_rate": 1.837142002183184e-07, + "loss": 0.13910138607025146, + "step": 5295 + }, + { + "epoch": 0.9643377001455604, + "grad_norm": 0.18155774474143982, + "learning_rate": 1.7491455332391548e-07, + "loss": 0.14177814722061158, + "step": 5300 + }, + { + "epoch": 0.9652474526928675, + "grad_norm": 0.32478174567222595, + "learning_rate": 1.6633011588307878e-07, + "loss": 0.14292703866958617, + "step": 5305 + }, + { + "epoch": 0.9661572052401747, + "grad_norm": 0.18050940334796906, + "learning_rate": 1.5796096231456558e-07, + "loss": 0.13252723217010498, + "step": 5310 + }, + { + "epoch": 0.9670669577874818, + "grad_norm": 0.15919657051563263, + "learning_rate": 1.4980716517083715e-07, + "loss": 0.14491976499557496, + "step": 5315 + }, + { + "epoch": 0.9679767103347889, + "grad_norm": 0.15895310044288635, + "learning_rate": 1.4186879513741758e-07, + "loss": 0.13617006540298462, + "step": 5320 + }, + { + "epoch": 0.9688864628820961, + "grad_norm": 0.1543736606836319, + "learning_rate": 1.3414592103228595e-07, + "loss": 0.14220429658889772, + "step": 5325 + }, + { + "epoch": 0.9697962154294032, + "grad_norm": 0.16660647094249725, + "learning_rate": 1.2663860980528797e-07, + "loss": 0.14069980382919312, + "step": 5330 + }, + { + "epoch": 0.9707059679767104, + "grad_norm": 0.15238550305366516, + "learning_rate": 1.1934692653754186e-07, + "loss": 0.13978019952774048, + "step": 5335 + }, + { + "epoch": 0.9716157205240175, + "grad_norm": 0.1649473011493683, + "learning_rate": 1.1227093444088066e-07, + "loss": 0.1401435136795044, + "step": 5340 + }, + { + "epoch": 0.9725254730713246, + "grad_norm": 0.14920124411582947, + "learning_rate": 1.0541069485730249e-07, + "loss": 0.13952178955078126, + "step": 5345 + }, + { + "epoch": 0.9734352256186317, + "grad_norm": 0.16802479326725006, + "learning_rate": 9.876626725844329e-08, + "loss": 0.14808181524276734, + "step": 5350 + }, + { + "epoch": 0.9743449781659389, + "grad_norm": 0.18096603453159332, + "learning_rate": 9.233770924505781e-08, + "loss": 0.13938647508621216, + "step": 5355 + }, + { + "epoch": 0.975254730713246, + "grad_norm": 0.1658579558134079, + "learning_rate": 8.612507654651991e-08, + "loss": 0.14219754934310913, + "step": 5360 + }, + { + "epoch": 0.9761644832605532, + "grad_norm": 0.1547713279724121, + "learning_rate": 8.012842302033696e-08, + "loss": 0.14298388957977295, + "step": 5365 + }, + { + "epoch": 0.9770742358078602, + "grad_norm": 0.18247587978839874, + "learning_rate": 7.434780065169178e-08, + "loss": 0.14103788137435913, + "step": 5370 + }, + { + "epoch": 0.9779839883551674, + "grad_norm": 0.17593605816364288, + "learning_rate": 6.878325955297915e-08, + "loss": 0.1450013041496277, + "step": 5375 + }, + { + "epoch": 0.9788937409024745, + "grad_norm": 0.17178039252758026, + "learning_rate": 6.343484796338395e-08, + "loss": 0.14021269083023072, + "step": 5380 + }, + { + "epoch": 0.9798034934497817, + "grad_norm": 0.17904147505760193, + "learning_rate": 5.830261224845923e-08, + "loss": 0.1460060477256775, + "step": 5385 + }, + { + "epoch": 0.9807132459970888, + "grad_norm": 0.16323266923427582, + "learning_rate": 5.338659689971548e-08, + "loss": 0.13915741443634033, + "step": 5390 + }, + { + "epoch": 0.9816229985443959, + "grad_norm": 0.1829039305448532, + "learning_rate": 4.8686844534248655e-08, + "loss": 0.1372266888618469, + "step": 5395 + }, + { + "epoch": 0.982532751091703, + "grad_norm": 0.16742415726184845, + "learning_rate": 4.420339589435995e-08, + "loss": 0.14404670000076295, + "step": 5400 + }, + { + "epoch": 0.9834425036390102, + "grad_norm": 0.20223695039749146, + "learning_rate": 3.9936289847206097e-08, + "loss": 0.14259873628616332, + "step": 5405 + }, + { + "epoch": 0.9843522561863173, + "grad_norm": 0.16556286811828613, + "learning_rate": 3.588556338446625e-08, + "loss": 0.1486513614654541, + "step": 5410 + }, + { + "epoch": 0.9852620087336245, + "grad_norm": 0.17593473196029663, + "learning_rate": 3.205125162201727e-08, + "loss": 0.1421863079071045, + "step": 5415 + }, + { + "epoch": 0.9861717612809315, + "grad_norm": 0.16081394255161285, + "learning_rate": 2.8433387799631228e-08, + "loss": 0.14347248077392577, + "step": 5420 + }, + { + "epoch": 0.9870815138282387, + "grad_norm": 0.16976398229599, + "learning_rate": 2.503200328067834e-08, + "loss": 0.13244209289550782, + "step": 5425 + }, + { + "epoch": 0.9879912663755459, + "grad_norm": 0.1760631501674652, + "learning_rate": 2.1847127551874458e-08, + "loss": 0.13686281442642212, + "step": 5430 + }, + { + "epoch": 0.988901018922853, + "grad_norm": 0.1616654098033905, + "learning_rate": 1.8878788223009036e-08, + "loss": 0.14251030683517457, + "step": 5435 + }, + { + "epoch": 0.9898107714701602, + "grad_norm": 0.17077618837356567, + "learning_rate": 1.6127011026703663e-08, + "loss": 0.13724164962768554, + "step": 5440 + }, + { + "epoch": 0.9907205240174672, + "grad_norm": 0.17870111763477325, + "learning_rate": 1.359181981820945e-08, + "loss": 0.13891533613204957, + "step": 5445 + }, + { + "epoch": 0.9916302765647744, + "grad_norm": 0.17270788550376892, + "learning_rate": 1.1273236575173873e-08, + "loss": 0.1403287410736084, + "step": 5450 + }, + { + "epoch": 0.9925400291120815, + "grad_norm": 0.14556582272052765, + "learning_rate": 9.171281397471476e-09, + "loss": 0.14006919860839845, + "step": 5455 + }, + { + "epoch": 0.9934497816593887, + "grad_norm": 0.173613503575325, + "learning_rate": 7.285972507017902e-09, + "loss": 0.13570475578308105, + "step": 5460 + }, + { + "epoch": 0.9943595342066958, + "grad_norm": 0.17131361365318298, + "learning_rate": 5.617326247614463e-09, + "loss": 0.13654524087905884, + "step": 5465 + }, + { + "epoch": 0.995269286754003, + "grad_norm": 0.1668723076581955, + "learning_rate": 4.1653570848121385e-09, + "loss": 0.13835798501968383, + "step": 5470 + }, + { + "epoch": 0.99617903930131, + "grad_norm": 0.17216536402702332, + "learning_rate": 2.9300776057727962e-09, + "loss": 0.1460067868232727, + "step": 5475 + }, + { + "epoch": 0.9970887918486172, + "grad_norm": 0.17424245178699493, + "learning_rate": 1.911498519177601e-09, + "loss": 0.13620004653930665, + "step": 5480 + }, + { + "epoch": 0.9979985443959243, + "grad_norm": 0.20578418672084808, + "learning_rate": 1.1096286551187663e-09, + "loss": 0.13895630836486816, + "step": 5485 + }, + { + "epoch": 0.9989082969432315, + "grad_norm": 0.19702354073524475, + "learning_rate": 5.244749650301639e-10, + "loss": 0.14265141487121583, + "step": 5490 + }, + { + "epoch": 0.9998180494905385, + "grad_norm": 0.16503211855888367, + "learning_rate": 1.560425216318162e-10, + "loss": 0.15260394811630248, + "step": 5495 + }, + { + "epoch": 1.0007278020378456, + "grad_norm": 0.18294842541217804, + "learning_rate": 4.334518874382631e-12, + "loss": 0.14066877365112304, + "step": 5500 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.0227691751004954e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-5500/training_args.bin b/checkpoint-5500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-5500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-600/README.md b/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-600/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-600/adapter_config.json b/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-600/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-600/adapter_model.safetensors b/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f3df1e614ada9585a69f31c15e39d6ed0612e47c --- /dev/null +++ b/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:079988b42f6ac1d048db321d7423b2482c1a86094d65b822f0554bcb55d53829 +size 169741912 diff --git a/checkpoint-600/chat_template.jinja b/checkpoint-600/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-600/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fda89d6b203087797d99363bb2e51c2475ba10d --- /dev/null +++ b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:253cd8f2f837c96e77fd97da074f4de32dac3ab6554465896c052e17a99224e0 +size 72807355 diff --git a/checkpoint-600/processor_config.json b/checkpoint-600/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-600/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..696d2dcd63a1a52d8a725d739aa534f92ae2f9b6 --- /dev/null +++ b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a077206253fffe03d7133f10fc8a0dcefffd2da6f33c9dacdf9ab88464770da4 +size 1465 diff --git a/checkpoint-600/tokenizer.json b/checkpoint-600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-600/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..586fa47a37ccc44f5501b31eb922888617139321 --- /dev/null +++ b/checkpoint-600/trainer_state.json @@ -0,0 +1,882 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1091703056768559, + "eval_steps": 100, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.3860575973684506e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-700/README.md b/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-700/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-700/adapter_config.json b/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-700/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-700/adapter_model.safetensors b/checkpoint-700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ef669b5fe5a5e86ab0b2c5d53613e1afc15f70c1 --- /dev/null +++ b/checkpoint-700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfd534ebc440aac595d09991db009cd775fc55bd7bff64bcd8fdb6db3fddbf91 +size 169741912 diff --git a/checkpoint-700/chat_template.jinja b/checkpoint-700/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-700/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-700/optimizer.pt b/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3a757ea998411e00a63daf895731d0a4b0d5973 --- /dev/null +++ b/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb322ffb9f7cf0b6017382f053576419422b24981728ada8328ae1108923e289 +size 72807355 diff --git a/checkpoint-700/processor_config.json b/checkpoint-700/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-700/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-700/rng_state.pth b/checkpoint-700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-700/scheduler.pt b/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..19acdc4ff8df16bf1bddaaf0954a21835611642d --- /dev/null +++ b/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a175384262eeb10988bcb724a64316d5dcf76ffb38ae5f2f079617356a41f965 +size 1465 diff --git a/checkpoint-700/tokenizer.json b/checkpoint-700/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-700/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-700/tokenizer_config.json b/checkpoint-700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-700/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-700/trainer_state.json b/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..69d2feae831ab331099f797648b54c70fc20bc13 --- /dev/null +++ b/checkpoint-700/trainer_state.json @@ -0,0 +1,1022 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.12736535662299855, + "eval_steps": 100, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.942520566427561e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-700/training_args.bin b/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-800/README.md b/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-800/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-800/adapter_config.json b/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-800/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-800/adapter_model.safetensors b/checkpoint-800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a3659672fd34a35af61f8aa4a5253fadc26d8297 --- /dev/null +++ b/checkpoint-800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7201736ae99ac88b287b440523029679d25d06fbadcb1429efff61306ad6b2d +size 169741912 diff --git a/checkpoint-800/chat_template.jinja b/checkpoint-800/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-800/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c99a70931b6823bfe79edacc5fbebafbf8b247cc --- /dev/null +++ b/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eceee864faeb468b238da74ec39df8a2fc1ca8a0040ebd622b313c0cbaeb8f93 +size 72807355 diff --git a/checkpoint-800/processor_config.json b/checkpoint-800/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-800/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-800/rng_state.pth b/checkpoint-800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..236cf74dcd1d495228ee0918233b17f512f03d86 --- /dev/null +++ b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c184a427ce0ea76af847dd0f82d4738b58b7d908e5a757bb1e42fd2930717745 +size 1465 diff --git a/checkpoint-800/tokenizer.json b/checkpoint-800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-800/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..34289530de5a025deb346a8891b33b079229a560 --- /dev/null +++ b/checkpoint-800/trainer_state.json @@ -0,0 +1,1162 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.14556040756914118, + "eval_steps": 100, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.491847953207176e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/checkpoint-900/README.md b/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..987e95358a030201b3953e31134ac114bb6a1a3e --- /dev/null +++ b/checkpoint-900/README.md @@ -0,0 +1,210 @@ +--- +base_model: unsloth/gemma-4-E4B-it +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:unsloth/gemma-4-E4B-it +- lora +- sft +- transformers +- trl +- unsloth +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.19.1 \ No newline at end of file diff --git a/checkpoint-900/adapter_config.json b/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a151f142d19277baf26e5d953008dec6000a228 --- /dev/null +++ b/checkpoint-900/adapter_config.json @@ -0,0 +1,52 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": { + "base_model_class": "Gemma4ForConditionalGeneration", + "parent_library": "transformers.models.gemma4.modeling_gemma4", + "unsloth_fixed": true + }, + "base_model_name_or_path": "unsloth/gemma-4-E4B-it", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_bias": false, + "lora_dropout": 0.0, + "lora_ga_config": null, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.19.1", + "qalora_group_size": 16, + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "up_proj", + "down_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_bdlora": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-900/adapter_model.safetensors b/checkpoint-900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4d23f27e223e86c43bfa2f80bed9064f02128aa8 --- /dev/null +++ b/checkpoint-900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:448f0cce2d69719e9a51b87d34c93ec5d9a1457f17e4b66a61c07c91a90a6501 +size 169741912 diff --git a/checkpoint-900/chat_template.jinja b/checkpoint-900/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..dc032e46e28e0d325b18d26da1279cdda3afa8d7 --- /dev/null +++ b/checkpoint-900/chat_template.jinja @@ -0,0 +1,351 @@ +{%- macro format_parameters(properties, required, filter_keys=false) -%} + {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in properties | dictsort -%} + {%- set add_comma = false -%} + {%- if not filter_keys or key not in standard_keys -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {{ key }}:{ + {%- if value['description'] -%} + description:<|"|>{{ value['description'] }}<|"|> + {%- set add_comma = true -%} + {%- endif -%} + {%- if value['type'] | upper == 'STRING' -%} + {%- if value['enum'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + enum:{{ format_argument(value['enum']) }} + {%- endif -%} + {%- elif value['type'] | upper == 'ARRAY' -%} + {%- if value['items'] is mapping and value['items'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + items:{ + {%- set ns_items = namespace(found_first=false) -%} + {%- for item_key, item_value in value['items'] | dictsort -%} + {%- if item_value is not none -%} + {%- if ns_items.found_first %},{% endif -%} + {%- set ns_items.found_first = true -%} + {%- if item_key == 'properties' -%} + properties:{ + {%- if item_value is mapping -%} + {{- format_parameters(item_value, value['items']['required'] | default([])) -}} + {%- endif -%} + } + {%- elif item_key == 'required' -%} + required:[ + {%- for req_item in item_value -%} + <|"|>{{- req_item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- elif item_key == 'type' -%} + {%- if item_value is string -%} + type:{{ format_argument(item_value | upper) }} + {%- else -%} + type:{{ format_argument(item_value | map('upper') | list) }} + {%- endif -%} + {%- else -%} + {{ item_key }}:{{ format_argument(item_value) }} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + } + {%- endif -%} + {%- endif -%} + {%- if value['nullable'] %} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + nullable:true + {%- endif -%} + {%- if value['type'] | upper == 'OBJECT' -%} + {%- if value['properties'] is defined and value['properties'] is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value['properties'], value['required'] | default([])) -}} + } + {%- elif value is mapping -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + properties:{ + {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}} + } + {%- endif -%} + {%- if value['required'] -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + required:[ + {%- for item in value['required'] | default([]) -%} + <|"|>{{- item -}}<|"|> + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + ] + {%- endif -%} + {%- endif -%} + {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%} + type:<|"|>{{ value['type'] | upper }}<|"|>} + {%- endif -%} + {%- endfor -%} +{%- endmacro -%} +{%- macro format_function_declaration(tool_data) -%} + declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|> + {%- set params = tool_data['function']['parameters'] -%} + {%- if params -%} + ,parameters:{ + {%- if params['properties'] -%} + properties:{ {{- format_parameters(params['properties'], params['required']) -}} }, + {%- endif -%} + {%- if params['required'] -%} + required:[ + {%- for item in params['required'] -%} + <|"|>{{- item -}}<|"|> + {{- ',' if not loop.last -}} + {%- endfor -%} + ], + {%- endif -%} + {%- if params['type'] -%} + type:<|"|>{{- params['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + {%- if 'response' in tool_data['function'] -%} + {%- set response_declaration = tool_data['function']['response'] -%} + ,response:{ + {%- if response_declaration['description'] -%} + description:<|"|>{{- response_declaration['description'] -}}<|"|>, + {%- endif -%} + {%- if response_declaration['type'] | upper == 'OBJECT' -%} + type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>} + {%- endif -%} + {%- endif -%} + } +{%- endmacro -%} +{%- macro format_argument(argument, escape_keys=True) -%} + {%- if argument is string -%} + {{- '<|"|>' + argument + '<|"|>' -}} + {%- elif argument is boolean -%} + {{- 'true' if argument else 'false' -}} + {%- elif argument is mapping -%} + {{- '{' -}} + {%- set ns = namespace(found_first=false) -%} + {%- for key, value in argument | dictsort -%} + {%- if ns.found_first %},{% endif -%} + {%- set ns.found_first = true -%} + {%- if escape_keys -%} + {{- '<|"|>' + key + '<|"|>' -}} + {%- else -%} + {{- key -}} + {%- endif -%} + :{{- format_argument(value, escape_keys=escape_keys) -}} + {%- endfor -%} + {{- '}' -}} + {%- elif argument is sequence -%} + {{- '[' -}} + {%- for item in argument -%} + {{- format_argument(item, escape_keys=escape_keys) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- ']' -}} + {%- else -%} + {{- argument -}} + {%- endif -%} +{%- endmacro -%} +{%- macro strip_thinking(text) -%} + {%- set ns = namespace(result='') -%} + {%- for part in text.split('') -%} + {%- if '<|channel>' in part -%} + {%- set ns.result = ns.result + part.split('<|channel>')[0] -%} + {%- else -%} + {%- set ns.result = ns.result + part -%} + {%- endif -%} + {%- endfor -%} + {{- ns.result | trim -}} +{%- endmacro -%} + +{%- macro format_tool_response_block(tool_name, response) -%} + {{- '<|tool_response>' -}} + {%- if response is mapping -%} + {{- 'response:' + tool_name + '{' -}} + {%- for key, value in response | dictsort -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- if not loop.last %},{% endif -%} + {%- endfor -%} + {{- '}' -}} + {%- else -%} + {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}} + {%- endif -%} + {{- '' -}} +{%- endmacro -%} + +{%- set ns = namespace(prev_message_type=None) -%} +{%- set loop_messages = messages -%} +{{- bos_token -}} +{#- Handle System/Tool Definitions Block -#} +{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%} + {{- '<|turn>system\n' -}} + {#- Inject Thinking token at the very top of the FIRST system turn -#} + {%- if enable_thinking is defined and enable_thinking -%} + {{- '<|think|>\n' -}} + {%- set ns.prev_message_type = 'think' -%} + {%- endif -%} + {%- if messages[0]['role'] in ['system', 'developer'] -%} + {%- if messages[0]['content'] is string -%} + {{- messages[0]['content'] | trim -}} + {%- elif messages[0]['content'] is sequence -%} + {%- for item in messages[0]['content'] -%} + {{- item['text'] | trim + ' '-}} + {%- endfor -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%} + {%- endif -%} + {%- if tools -%} + {%- for tool in tools %} + {{- '<|tool>' -}} + {{- format_function_declaration(tool) | trim -}} + {{- '' -}} + {%- endfor %} + {%- set ns.prev_message_type = 'tool' -%} + {%- endif -%} + {{- '\n' -}} +{%- endif %} + +{#- Pre-scan: find last user message index for reasoning guard -#} +{%- set ns_turn = namespace(last_user_idx=-1) -%} +{%- for i in range(loop_messages | length) -%} + {%- if loop_messages[i]['role'] == 'user' -%} + {%- set ns_turn.last_user_idx = i -%} + {%- endif -%} +{%- endfor -%} + +{#- Loop through messages -#} +{%- for message in loop_messages -%} + {%- if message['role'] != 'tool' -%} + {%- set ns.prev_message_type = None -%} + {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%} + {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#} + {%- set prev_nt = namespace(role=None, found=false) -%} + {%- if loop.index0 > 0 -%} + {%- for j in range(loop.index0 - 1, -1, -1) -%} + {%- if not prev_nt.found -%} + {%- if loop_messages[j]['role'] != 'tool' -%} + {%- set prev_nt.role = loop_messages[j]['role'] -%} + {%- set prev_nt.found = true -%} + {%- endif -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%} + {%- if not continue_same_model_turn -%} + {{- '<|turn>' + role + '\n' }} + {%- endif -%} + + {#- Render reasoning/reasoning_content as thinking channel -#} + {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%} + {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%} + {{- '<|channel>thought\n' + thinking_text + '\n' -}} + {%- endif -%} + + {%- if message['tool_calls'] -%} + {%- for tool_call in message['tool_calls'] -%} + {%- set function = tool_call['function'] -%} + {{- '<|tool_call>call:' + function['name'] + '{' -}} + {%- if function['arguments'] is mapping -%} + {%- set ns_args = namespace(found_first=false) -%} + {%- for key, value in function['arguments'] | dictsort -%} + {%- if ns_args.found_first %},{% endif -%} + {%- set ns_args.found_first = true -%} + {{- key -}}:{{- format_argument(value, escape_keys=False) -}} + {%- endfor -%} + {%- elif function['arguments'] is string -%} + {{- function['arguments'] -}} + {%- endif -%} + {{- '}' -}} + {%- endfor -%} + {%- set ns.prev_message_type = 'tool_call' -%} + {%- endif -%} + + {%- set ns_tr_out = namespace(flag=false) -%} + {%- if message.get('tool_responses') -%} + {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#} + {%- for tool_response in message['tool_responses'] -%} + {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endfor -%} + {%- elif message.get('tool_calls') -%} + {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#} + {%- set ns_tool_scan = namespace(stopped=false) -%} + {%- for k in range(loop.index0 + 1, loop_messages | length) -%} + {%- if ns_tool_scan.stopped -%} + {%- elif loop_messages[k]['role'] != 'tool' -%} + {%- set ns_tool_scan.stopped = true -%} + {%- else -%} + {%- set follow = loop_messages[k] -%} + {#- Resolve tool_call_id to function name -#} + {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%} + {%- for tc in message['tool_calls'] -%} + {%- if tc.get('id') == follow.get('tool_call_id') -%} + {%- set ns_tname.name = tc['function']['name'] -%} + {%- endif -%} + {%- endfor -%} + {#- Handle content as string or content-parts array -#} + {%- set tool_body = follow.get('content') -%} + {%- if tool_body is string -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- elif tool_body is sequence and tool_body is not string -%} + {%- set ns_txt = namespace(s='') -%} + {%- for part in tool_body -%} + {%- if part.get('type') == 'text' -%} + {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%} + {%- endif -%} + {%- endfor -%} + {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}} + {%- else -%} + {{- format_tool_response_block(ns_tname.name, tool_body) -}} + {%- endif -%} + {%- set ns_tr_out.flag = true -%} + {%- set ns.prev_message_type = 'tool_response' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + + {%- set captured_content -%} + {%- if message['content'] is string -%} + {%- if role == 'model' -%} + {{- strip_thinking(message['content']) -}} + {%- else -%} + {{- message['content'] | trim -}} + {%- endif -%} + {%- elif message['content'] is sequence -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'text' -%} + {%- if role == 'model' -%} + {{- strip_thinking(item['text']) -}} + {%- else -%} + {{- item['text'] | trim -}} + {%- endif -%} + {%- elif item['type'] == 'image' -%} + {{- '<|image|>' -}} + {%- set ns.prev_message_type = 'image' -%} + {%- elif item['type'] == 'audio' -%} + {{- '<|audio|>' -}} + {%- set ns.prev_message_type = 'audio' -%} + {%- elif item['type'] == 'video' -%} + {{- '<|video|>' -}} + {%- set ns.prev_message_type = 'video' -%} + {%- endif -%} + {%- endfor -%} + {%- endif -%} + {%- endset -%} + + {{- captured_content -}} + {%- set has_content = captured_content | trim | length > 0 -%} + + {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%} + {{- '<|tool_response>' -}} + {%- elif not (ns_tr_out.flag and not has_content) -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt -%} + {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%} + {{- '<|turn>model\n' -}} + {%- endif -%} +{%- endif -%} \ No newline at end of file diff --git a/checkpoint-900/optimizer.pt b/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d84c0563604680a38932c53f3e553b60cebdad2 --- /dev/null +++ b/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:618adf05e643bfc75c2079b649320ceb328881eb992e9bee4754063fb8a51a9e +size 72807355 diff --git a/checkpoint-900/processor_config.json b/checkpoint-900/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/checkpoint-900/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/checkpoint-900/rng_state.pth b/checkpoint-900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/checkpoint-900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/checkpoint-900/scheduler.pt b/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d563813207098e5a394b16074e682d3cbdf758a --- /dev/null +++ b/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c50238c6ad9c1ec389a246215cc396aa1c87e4d537fe0cd9b3bed4a73b840f4d +size 1465 diff --git a/checkpoint-900/tokenizer.json b/checkpoint-900/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/checkpoint-900/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/checkpoint-900/tokenizer_config.json b/checkpoint-900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/checkpoint-900/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +} diff --git a/checkpoint-900/trainer_state.json b/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9e0f63f3b081d00a6c3fb0a3e0d6070d7afeca4e --- /dev/null +++ b/checkpoint-900/trainer_state.json @@ -0,0 +1,1302 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.16375545851528384, + "eval_steps": 100, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009097525473071324, + "grad_norm": 1.0602493286132812, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.7156932830810547, + "step": 5 + }, + { + "epoch": 0.001819505094614265, + "grad_norm": 1.1577719449996948, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.6629371643066406, + "step": 10 + }, + { + "epoch": 0.0027292576419213972, + "grad_norm": 1.0288419723510742, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6706295013427734, + "step": 15 + }, + { + "epoch": 0.00363901018922853, + "grad_norm": 2.129403829574585, + "learning_rate": 5.7575757575757586e-06, + "loss": 1.7363752365112304, + "step": 20 + }, + { + "epoch": 0.004548762736535662, + "grad_norm": 1.9468326568603516, + "learning_rate": 7.272727272727272e-06, + "loss": 1.7111135482788087, + "step": 25 + }, + { + "epoch": 0.0054585152838427945, + "grad_norm": 1.1269357204437256, + "learning_rate": 8.787878787878788e-06, + "loss": 1.6924203872680663, + "step": 30 + }, + { + "epoch": 0.006368267831149927, + "grad_norm": 1.4021248817443848, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.658310317993164, + "step": 35 + }, + { + "epoch": 0.00727802037845706, + "grad_norm": 1.313381314277649, + "learning_rate": 1.1818181818181819e-05, + "loss": 1.5383296012878418, + "step": 40 + }, + { + "epoch": 0.008187772925764192, + "grad_norm": 2.4359891414642334, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.4302565574645996, + "step": 45 + }, + { + "epoch": 0.009097525473071324, + "grad_norm": 1.6459542512893677, + "learning_rate": 1.484848484848485e-05, + "loss": 1.2602953910827637, + "step": 50 + }, + { + "epoch": 0.010007278020378457, + "grad_norm": 0.7953159213066101, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.204326343536377, + "step": 55 + }, + { + "epoch": 0.010917030567685589, + "grad_norm": 0.5824465155601501, + "learning_rate": 1.787878787878788e-05, + "loss": 1.068561840057373, + "step": 60 + }, + { + "epoch": 0.011826783114992722, + "grad_norm": 0.39265626668930054, + "learning_rate": 1.9393939393939395e-05, + "loss": 0.9570062637329102, + "step": 65 + }, + { + "epoch": 0.012736535662299854, + "grad_norm": 0.3387283384799957, + "learning_rate": 2.090909090909091e-05, + "loss": 0.9454713821411133, + "step": 70 + }, + { + "epoch": 0.013646288209606987, + "grad_norm": 0.3182811141014099, + "learning_rate": 2.2424242424242424e-05, + "loss": 0.8901592254638672, + "step": 75 + }, + { + "epoch": 0.01455604075691412, + "grad_norm": 0.2735312879085541, + "learning_rate": 2.393939393939394e-05, + "loss": 0.8491583824157715, + "step": 80 + }, + { + "epoch": 0.015465793304221253, + "grad_norm": 0.2376435250043869, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.8109179496765136, + "step": 85 + }, + { + "epoch": 0.016375545851528384, + "grad_norm": 0.2161586880683899, + "learning_rate": 2.696969696969697e-05, + "loss": 0.76962308883667, + "step": 90 + }, + { + "epoch": 0.017285298398835518, + "grad_norm": 0.19587980210781097, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7301986694335938, + "step": 95 + }, + { + "epoch": 0.018195050946142648, + "grad_norm": 0.20971694588661194, + "learning_rate": 3e-05, + "loss": 0.7269618034362793, + "step": 100 + }, + { + "epoch": 0.018195050946142648, + "eval_loss": 2.605874538421631, + "eval_runtime": 1120.0905, + "eval_samples_per_second": 33.935, + "eval_steps_per_second": 8.484, + "step": 100 + }, + { + "epoch": 0.01910480349344978, + "grad_norm": 0.10413152724504471, + "learning_rate": 3.151515151515151e-05, + "loss": 0.3250573635101318, + "step": 105 + }, + { + "epoch": 0.020014556040756915, + "grad_norm": 0.09383206814527512, + "learning_rate": 3.303030303030303e-05, + "loss": 0.3277724742889404, + "step": 110 + }, + { + "epoch": 0.020924308588064048, + "grad_norm": 0.1195850670337677, + "learning_rate": 3.454545454545455e-05, + "loss": 0.3215961217880249, + "step": 115 + }, + { + "epoch": 0.021834061135371178, + "grad_norm": 0.0715397521853447, + "learning_rate": 3.606060606060606e-05, + "loss": 0.3120795965194702, + "step": 120 + }, + { + "epoch": 0.02274381368267831, + "grad_norm": 0.068007692694664, + "learning_rate": 3.757575757575758e-05, + "loss": 0.2964257955551147, + "step": 125 + }, + { + "epoch": 0.023653566229985445, + "grad_norm": 0.09345484524965286, + "learning_rate": 3.909090909090909e-05, + "loss": 0.30776252746582033, + "step": 130 + }, + { + "epoch": 0.024563318777292575, + "grad_norm": 0.05577846243977547, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.3180255889892578, + "step": 135 + }, + { + "epoch": 0.025473071324599708, + "grad_norm": 0.05919989198446274, + "learning_rate": 4.212121212121212e-05, + "loss": 0.31608285903930666, + "step": 140 + }, + { + "epoch": 0.02638282387190684, + "grad_norm": 0.05644674599170685, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.2993780136108398, + "step": 145 + }, + { + "epoch": 0.027292576419213975, + "grad_norm": 0.059986088424921036, + "learning_rate": 4.515151515151516e-05, + "loss": 0.2931638479232788, + "step": 150 + }, + { + "epoch": 0.028202328966521105, + "grad_norm": 0.05941484495997429, + "learning_rate": 4.666666666666667e-05, + "loss": 0.29284651279449464, + "step": 155 + }, + { + "epoch": 0.02911208151382824, + "grad_norm": 0.0579044483602047, + "learning_rate": 4.8181818181818186e-05, + "loss": 0.2927037000656128, + "step": 160 + }, + { + "epoch": 0.030021834061135372, + "grad_norm": 0.061985693871974945, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.28671720027923586, + "step": 165 + }, + { + "epoch": 0.030931586608442505, + "grad_norm": 0.05715535953640938, + "learning_rate": 4.999993064772809e-05, + "loss": 0.2817929744720459, + "step": 170 + }, + { + "epoch": 0.03184133915574964, + "grad_norm": 0.06549780815839767, + "learning_rate": 4.999964890478288e-05, + "loss": 0.27853829860687257, + "step": 175 + }, + { + "epoch": 0.03275109170305677, + "grad_norm": 0.05948757752776146, + "learning_rate": 4.999915043908795e-05, + "loss": 0.27522289752960205, + "step": 180 + }, + { + "epoch": 0.0336608442503639, + "grad_norm": 0.06262889504432678, + "learning_rate": 4.9998435254964515e-05, + "loss": 0.270997428894043, + "step": 185 + }, + { + "epoch": 0.034570596797671035, + "grad_norm": 0.06916829943656921, + "learning_rate": 4.999750335861253e-05, + "loss": 0.2788438558578491, + "step": 190 + }, + { + "epoch": 0.035480349344978165, + "grad_norm": 0.06128217652440071, + "learning_rate": 4.9996354758110624e-05, + "loss": 0.25649352073669435, + "step": 195 + }, + { + "epoch": 0.036390101892285295, + "grad_norm": 0.06704027950763702, + "learning_rate": 4.999498946341606e-05, + "loss": 0.25619523525238036, + "step": 200 + }, + { + "epoch": 0.03729985443959243, + "grad_norm": 0.061678580939769745, + "learning_rate": 4.999340748636462e-05, + "loss": 0.24956226348876953, + "step": 205 + }, + { + "epoch": 0.03820960698689956, + "grad_norm": 0.07328873127698898, + "learning_rate": 4.999160884067051e-05, + "loss": 0.26169676780700685, + "step": 210 + }, + { + "epoch": 0.0391193595342067, + "grad_norm": 0.08287990838289261, + "learning_rate": 4.9989593541926246e-05, + "loss": 0.2574604034423828, + "step": 215 + }, + { + "epoch": 0.04002911208151383, + "grad_norm": 0.06787359714508057, + "learning_rate": 4.9987361607602525e-05, + "loss": 0.25351409912109374, + "step": 220 + }, + { + "epoch": 0.04093886462882096, + "grad_norm": 0.06695502996444702, + "learning_rate": 4.998491305704805e-05, + "loss": 0.24522039890289307, + "step": 225 + }, + { + "epoch": 0.041848617176128096, + "grad_norm": 0.08872214704751968, + "learning_rate": 4.9982247911489375e-05, + "loss": 0.2581867933273315, + "step": 230 + }, + { + "epoch": 0.042758369723435226, + "grad_norm": 0.07637131959199905, + "learning_rate": 4.9979366194030743e-05, + "loss": 0.25569658279418944, + "step": 235 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.08158119022846222, + "learning_rate": 4.997626792965385e-05, + "loss": 0.2529409646987915, + "step": 240 + }, + { + "epoch": 0.04457787481804949, + "grad_norm": 0.07529161125421524, + "learning_rate": 4.997295314521766e-05, + "loss": 0.24049024581909179, + "step": 245 + }, + { + "epoch": 0.04548762736535662, + "grad_norm": 0.08860139548778534, + "learning_rate": 4.996942186945813e-05, + "loss": 0.2490522861480713, + "step": 250 + }, + { + "epoch": 0.04639737991266375, + "grad_norm": 0.0850321501493454, + "learning_rate": 4.9965674132988005e-05, + "loss": 0.24180831909179687, + "step": 255 + }, + { + "epoch": 0.04730713245997089, + "grad_norm": 0.07556115090847015, + "learning_rate": 4.996170996829653e-05, + "loss": 0.2509631872177124, + "step": 260 + }, + { + "epoch": 0.04821688500727802, + "grad_norm": 0.07971206307411194, + "learning_rate": 4.995752940974918e-05, + "loss": 0.24398891925811766, + "step": 265 + }, + { + "epoch": 0.04912663755458515, + "grad_norm": 0.09149336814880371, + "learning_rate": 4.9953132493587344e-05, + "loss": 0.2300492286682129, + "step": 270 + }, + { + "epoch": 0.050036390101892286, + "grad_norm": 0.08265820890665054, + "learning_rate": 4.9948519257928034e-05, + "loss": 0.24246792793273925, + "step": 275 + }, + { + "epoch": 0.050946142649199416, + "grad_norm": 0.10328587144613266, + "learning_rate": 4.9943689742763534e-05, + "loss": 0.2367171049118042, + "step": 280 + }, + { + "epoch": 0.05185589519650655, + "grad_norm": 0.0836917981505394, + "learning_rate": 4.993864398996105e-05, + "loss": 0.23215813636779786, + "step": 285 + }, + { + "epoch": 0.05276564774381368, + "grad_norm": 0.09475161135196686, + "learning_rate": 4.99333820432624e-05, + "loss": 0.2350748062133789, + "step": 290 + }, + { + "epoch": 0.05367540029112081, + "grad_norm": 0.08040128648281097, + "learning_rate": 4.992790394828355e-05, + "loss": 0.23253886699676513, + "step": 295 + }, + { + "epoch": 0.05458515283842795, + "grad_norm": 0.08852150291204453, + "learning_rate": 4.992220975251428e-05, + "loss": 0.23856515884399415, + "step": 300 + }, + { + "epoch": 0.05549490538573508, + "grad_norm": 0.09565229713916779, + "learning_rate": 4.991629950531775e-05, + "loss": 0.23311660289764405, + "step": 305 + }, + { + "epoch": 0.05640465793304221, + "grad_norm": 0.08158160001039505, + "learning_rate": 4.991017325793009e-05, + "loss": 0.22467944622039795, + "step": 310 + }, + { + "epoch": 0.05731441048034935, + "grad_norm": 0.07746429741382599, + "learning_rate": 4.990383106345994e-05, + "loss": 0.229844069480896, + "step": 315 + }, + { + "epoch": 0.05822416302765648, + "grad_norm": 0.08564355969429016, + "learning_rate": 4.989727297688797e-05, + "loss": 0.22414517402648926, + "step": 320 + }, + { + "epoch": 0.05913391557496361, + "grad_norm": 0.07517435401678085, + "learning_rate": 4.9890499055066435e-05, + "loss": 0.2236532211303711, + "step": 325 + }, + { + "epoch": 0.060043668122270744, + "grad_norm": 0.111734539270401, + "learning_rate": 4.988350935671869e-05, + "loss": 0.21474847793579102, + "step": 330 + }, + { + "epoch": 0.060953420669577874, + "grad_norm": 0.09906989336013794, + "learning_rate": 4.987630394243866e-05, + "loss": 0.23321933746337892, + "step": 335 + }, + { + "epoch": 0.06186317321688501, + "grad_norm": 0.10131457448005676, + "learning_rate": 4.98688828746903e-05, + "loss": 0.2310662031173706, + "step": 340 + }, + { + "epoch": 0.06277292576419213, + "grad_norm": 0.09203507006168365, + "learning_rate": 4.986124621780708e-05, + "loss": 0.22021169662475587, + "step": 345 + }, + { + "epoch": 0.06368267831149928, + "grad_norm": 0.09505912661552429, + "learning_rate": 4.9853394037991416e-05, + "loss": 0.2197155237197876, + "step": 350 + }, + { + "epoch": 0.06459243085880641, + "grad_norm": 0.09038657695055008, + "learning_rate": 4.984532640331412e-05, + "loss": 0.22066287994384765, + "step": 355 + }, + { + "epoch": 0.06550218340611354, + "grad_norm": 0.09707064181566238, + "learning_rate": 4.9837043383713753e-05, + "loss": 0.22455451488494874, + "step": 360 + }, + { + "epoch": 0.06641193595342067, + "grad_norm": 0.10367228090763092, + "learning_rate": 4.98285450509961e-05, + "loss": 0.21993820667266845, + "step": 365 + }, + { + "epoch": 0.0673216885007278, + "grad_norm": 0.12229471653699875, + "learning_rate": 4.9819831478833456e-05, + "loss": 0.2168867588043213, + "step": 370 + }, + { + "epoch": 0.06823144104803494, + "grad_norm": 0.0964592918753624, + "learning_rate": 4.981090274276406e-05, + "loss": 0.21579203605651856, + "step": 375 + }, + { + "epoch": 0.06914119359534207, + "grad_norm": 0.09400496631860733, + "learning_rate": 4.980175892019141e-05, + "loss": 0.20972180366516113, + "step": 380 + }, + { + "epoch": 0.0700509461426492, + "grad_norm": 0.08158645778894424, + "learning_rate": 4.9792400090383594e-05, + "loss": 0.22148358821868896, + "step": 385 + }, + { + "epoch": 0.07096069868995633, + "grad_norm": 0.10916394740343094, + "learning_rate": 4.978282633447261e-05, + "loss": 0.2214418649673462, + "step": 390 + }, + { + "epoch": 0.07187045123726346, + "grad_norm": 0.11138810962438583, + "learning_rate": 4.9773037735453636e-05, + "loss": 0.21814754009246826, + "step": 395 + }, + { + "epoch": 0.07278020378457059, + "grad_norm": 0.10914396494626999, + "learning_rate": 4.9763034378184365e-05, + "loss": 0.21310818195343018, + "step": 400 + }, + { + "epoch": 0.07368995633187773, + "grad_norm": 0.1043366864323616, + "learning_rate": 4.975281634938421e-05, + "loss": 0.21266789436340333, + "step": 405 + }, + { + "epoch": 0.07459970887918486, + "grad_norm": 0.1036868542432785, + "learning_rate": 4.9742383737633594e-05, + "loss": 0.21606721878051757, + "step": 410 + }, + { + "epoch": 0.075509461426492, + "grad_norm": 0.11640442907810211, + "learning_rate": 4.9731736633373144e-05, + "loss": 0.21532948017120362, + "step": 415 + }, + { + "epoch": 0.07641921397379912, + "grad_norm": 0.11219926178455353, + "learning_rate": 4.9720875128902956e-05, + "loss": 0.2191627025604248, + "step": 420 + }, + { + "epoch": 0.07732896652110625, + "grad_norm": 0.12103637307882309, + "learning_rate": 4.970979931838176e-05, + "loss": 0.20938868522644044, + "step": 425 + }, + { + "epoch": 0.0782387190684134, + "grad_norm": 0.13274189829826355, + "learning_rate": 4.96985092978261e-05, + "loss": 0.21792960166931152, + "step": 430 + }, + { + "epoch": 0.07914847161572053, + "grad_norm": 0.11164513230323792, + "learning_rate": 4.968700516510954e-05, + "loss": 0.2022618055343628, + "step": 435 + }, + { + "epoch": 0.08005822416302766, + "grad_norm": 0.09532847255468369, + "learning_rate": 4.967528701996174e-05, + "loss": 0.21255812644958497, + "step": 440 + }, + { + "epoch": 0.08096797671033479, + "grad_norm": 0.10279258340597153, + "learning_rate": 4.96633549639677e-05, + "loss": 0.20683050155639648, + "step": 445 + }, + { + "epoch": 0.08187772925764192, + "grad_norm": 0.1257462352514267, + "learning_rate": 4.965120910056677e-05, + "loss": 0.21419920921325683, + "step": 450 + }, + { + "epoch": 0.08278748180494905, + "grad_norm": 0.11663137376308441, + "learning_rate": 4.963884953505186e-05, + "loss": 0.2072287082672119, + "step": 455 + }, + { + "epoch": 0.08369723435225619, + "grad_norm": 0.10488224029541016, + "learning_rate": 4.96262763745684e-05, + "loss": 0.1982678532600403, + "step": 460 + }, + { + "epoch": 0.08460698689956332, + "grad_norm": 0.11801692098379135, + "learning_rate": 4.961348972811354e-05, + "loss": 0.20662031173706055, + "step": 465 + }, + { + "epoch": 0.08551673944687045, + "grad_norm": 0.11318827420473099, + "learning_rate": 4.96004897065351e-05, + "loss": 0.20947303771972656, + "step": 470 + }, + { + "epoch": 0.08642649199417758, + "grad_norm": 0.13409486413002014, + "learning_rate": 4.95872764225307e-05, + "loss": 0.19670876264572143, + "step": 475 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.14440792798995972, + "learning_rate": 4.957384999064672e-05, + "loss": 0.19842848777770997, + "step": 480 + }, + { + "epoch": 0.08824599708879186, + "grad_norm": 0.12246996909379959, + "learning_rate": 4.956021052727731e-05, + "loss": 0.20318071842193602, + "step": 485 + }, + { + "epoch": 0.08915574963609899, + "grad_norm": 0.13437233865261078, + "learning_rate": 4.954635815066342e-05, + "loss": 0.21675212383270265, + "step": 490 + }, + { + "epoch": 0.09006550218340612, + "grad_norm": 0.11109672486782074, + "learning_rate": 4.9532292980891744e-05, + "loss": 0.2100757837295532, + "step": 495 + }, + { + "epoch": 0.09097525473071325, + "grad_norm": 0.1388893872499466, + "learning_rate": 4.9518015139893675e-05, + "loss": 0.20303285121917725, + "step": 500 + }, + { + "epoch": 0.09188500727802038, + "grad_norm": 0.13239721953868866, + "learning_rate": 4.950352475144427e-05, + "loss": 0.2152268409729004, + "step": 505 + }, + { + "epoch": 0.0927947598253275, + "grad_norm": 0.12834979593753815, + "learning_rate": 4.948882194116119e-05, + "loss": 0.20799248218536376, + "step": 510 + }, + { + "epoch": 0.09370451237263465, + "grad_norm": 0.11886704713106155, + "learning_rate": 4.947390683650354e-05, + "loss": 0.20394976139068605, + "step": 515 + }, + { + "epoch": 0.09461426491994178, + "grad_norm": 0.11398876458406448, + "learning_rate": 4.945877956677083e-05, + "loss": 0.2091092586517334, + "step": 520 + }, + { + "epoch": 0.09552401746724891, + "grad_norm": 0.1422540694475174, + "learning_rate": 4.944344026310186e-05, + "loss": 0.19564238786697388, + "step": 525 + }, + { + "epoch": 0.09643377001455604, + "grad_norm": 0.11359584331512451, + "learning_rate": 4.9427889058473535e-05, + "loss": 0.20493624210357667, + "step": 530 + }, + { + "epoch": 0.09734352256186317, + "grad_norm": 0.11703553050756454, + "learning_rate": 4.941212608769974e-05, + "loss": 0.2098615884780884, + "step": 535 + }, + { + "epoch": 0.0982532751091703, + "grad_norm": 0.14552047848701477, + "learning_rate": 4.939615148743017e-05, + "loss": 0.20382182598114013, + "step": 540 + }, + { + "epoch": 0.09916302765647744, + "grad_norm": 0.13178016245365143, + "learning_rate": 4.937996539614914e-05, + "loss": 0.19901862144470214, + "step": 545 + }, + { + "epoch": 0.10007278020378457, + "grad_norm": 0.635392427444458, + "learning_rate": 4.936356795417439e-05, + "loss": 0.20694944858551026, + "step": 550 + }, + { + "epoch": 0.1009825327510917, + "grad_norm": 0.15019077062606812, + "learning_rate": 4.934695930365586e-05, + "loss": 0.19313746690750122, + "step": 555 + }, + { + "epoch": 0.10189228529839883, + "grad_norm": 0.12941956520080566, + "learning_rate": 4.9330139588574474e-05, + "loss": 0.19671722650527954, + "step": 560 + }, + { + "epoch": 0.10280203784570596, + "grad_norm": 0.13818831741809845, + "learning_rate": 4.931310895474088e-05, + "loss": 0.20026786327362062, + "step": 565 + }, + { + "epoch": 0.1037117903930131, + "grad_norm": 0.12011194974184036, + "learning_rate": 4.929586754979417e-05, + "loss": 0.1932437539100647, + "step": 570 + }, + { + "epoch": 0.10462154294032024, + "grad_norm": 0.1345364898443222, + "learning_rate": 4.9278415523200644e-05, + "loss": 0.20245940685272218, + "step": 575 + }, + { + "epoch": 0.10553129548762737, + "grad_norm": 0.13281017541885376, + "learning_rate": 4.926075302625247e-05, + "loss": 0.19864981174468993, + "step": 580 + }, + { + "epoch": 0.1064410480349345, + "grad_norm": 0.13465586304664612, + "learning_rate": 4.924288021206639e-05, + "loss": 0.19573183059692384, + "step": 585 + }, + { + "epoch": 0.10735080058224163, + "grad_norm": 0.15225961804389954, + "learning_rate": 4.9224797235582396e-05, + "loss": 0.19946801662445068, + "step": 590 + }, + { + "epoch": 0.10826055312954876, + "grad_norm": 0.12816746532917023, + "learning_rate": 4.92065042535624e-05, + "loss": 0.19851526021957397, + "step": 595 + }, + { + "epoch": 0.1091703056768559, + "grad_norm": 0.13802853226661682, + "learning_rate": 4.9188001424588824e-05, + "loss": 0.19321763515472412, + "step": 600 + }, + { + "epoch": 0.11008005822416303, + "grad_norm": 0.17504797875881195, + "learning_rate": 4.9169288909063295e-05, + "loss": 0.2032616138458252, + "step": 605 + }, + { + "epoch": 0.11098981077147016, + "grad_norm": 0.13544194400310516, + "learning_rate": 4.91503668692052e-05, + "loss": 0.2011256456375122, + "step": 610 + }, + { + "epoch": 0.11189956331877729, + "grad_norm": 1.3976134061813354, + "learning_rate": 4.91312354690503e-05, + "loss": 0.19916868209838867, + "step": 615 + }, + { + "epoch": 0.11280931586608442, + "grad_norm": 0.1465059071779251, + "learning_rate": 4.91118948744493e-05, + "loss": 0.19487457275390624, + "step": 620 + }, + { + "epoch": 0.11371906841339156, + "grad_norm": 0.12103168666362762, + "learning_rate": 4.909234525306645e-05, + "loss": 0.1907251238822937, + "step": 625 + }, + { + "epoch": 0.1146288209606987, + "grad_norm": 0.12660574913024902, + "learning_rate": 4.907258677437802e-05, + "loss": 0.19327253103256226, + "step": 630 + }, + { + "epoch": 0.11553857350800582, + "grad_norm": 0.1347813606262207, + "learning_rate": 4.90526196096709e-05, + "loss": 0.19637736082077026, + "step": 635 + }, + { + "epoch": 0.11644832605531295, + "grad_norm": 0.14953652024269104, + "learning_rate": 4.903244393204107e-05, + "loss": 0.20325069427490233, + "step": 640 + }, + { + "epoch": 0.11735807860262008, + "grad_norm": 0.13936272263526917, + "learning_rate": 4.901205991639213e-05, + "loss": 0.1930275321006775, + "step": 645 + }, + { + "epoch": 0.11826783114992721, + "grad_norm": 0.1448420137166977, + "learning_rate": 4.899146773943374e-05, + "loss": 0.20026936531066894, + "step": 650 + }, + { + "epoch": 0.11917758369723436, + "grad_norm": 0.1312534064054489, + "learning_rate": 4.897066757968014e-05, + "loss": 0.19062033891677857, + "step": 655 + }, + { + "epoch": 0.12008733624454149, + "grad_norm": 0.13644742965698242, + "learning_rate": 4.894965961744859e-05, + "loss": 0.18719595670700073, + "step": 660 + }, + { + "epoch": 0.12099708879184862, + "grad_norm": 0.14276087284088135, + "learning_rate": 4.892844403485777e-05, + "loss": 0.19784307479858398, + "step": 665 + }, + { + "epoch": 0.12190684133915575, + "grad_norm": 0.14735399186611176, + "learning_rate": 4.890702101582623e-05, + "loss": 0.19163782596588136, + "step": 670 + }, + { + "epoch": 0.12281659388646288, + "grad_norm": 0.15742065012454987, + "learning_rate": 4.888539074607082e-05, + "loss": 0.19312986135482788, + "step": 675 + }, + { + "epoch": 0.12372634643377002, + "grad_norm": 0.12917031347751617, + "learning_rate": 4.8863553413105025e-05, + "loss": 0.20066320896148682, + "step": 680 + }, + { + "epoch": 0.12463609898107715, + "grad_norm": 0.1484801322221756, + "learning_rate": 4.884150920623737e-05, + "loss": 0.20096096992492676, + "step": 685 + }, + { + "epoch": 0.12554585152838427, + "grad_norm": 0.1455296128988266, + "learning_rate": 4.88192583165698e-05, + "loss": 0.20518505573272705, + "step": 690 + }, + { + "epoch": 0.12645560407569142, + "grad_norm": 0.14517490565776825, + "learning_rate": 4.879680093699598e-05, + "loss": 0.18859238624572755, + "step": 695 + }, + { + "epoch": 0.12736535662299855, + "grad_norm": 0.18778090178966522, + "learning_rate": 4.877413726219964e-05, + "loss": 0.197074818611145, + "step": 700 + }, + { + "epoch": 0.12827510917030568, + "grad_norm": 0.13497677445411682, + "learning_rate": 4.87512674886529e-05, + "loss": 0.18713107109069824, + "step": 705 + }, + { + "epoch": 0.12918486171761281, + "grad_norm": 0.12657155096530914, + "learning_rate": 4.872819181461455e-05, + "loss": 0.1858484387397766, + "step": 710 + }, + { + "epoch": 0.13009461426491994, + "grad_norm": 0.11458148807287216, + "learning_rate": 4.870491044012834e-05, + "loss": 0.18732179403305055, + "step": 715 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.13000249862670898, + "learning_rate": 4.8681423567021244e-05, + "loss": 0.1872936010360718, + "step": 720 + }, + { + "epoch": 0.1319141193595342, + "grad_norm": 0.14580890536308289, + "learning_rate": 4.865773139890172e-05, + "loss": 0.19280019998550416, + "step": 725 + }, + { + "epoch": 0.13282387190684133, + "grad_norm": 0.1507277935743332, + "learning_rate": 4.8633834141157913e-05, + "loss": 0.1898929238319397, + "step": 730 + }, + { + "epoch": 0.13373362445414846, + "grad_norm": 0.1418737769126892, + "learning_rate": 4.860973200095592e-05, + "loss": 0.17926375865936278, + "step": 735 + }, + { + "epoch": 0.1346433770014556, + "grad_norm": 0.17151866853237152, + "learning_rate": 4.858542518723794e-05, + "loss": 0.18963592052459716, + "step": 740 + }, + { + "epoch": 0.13555312954876272, + "grad_norm": 0.11162743717432022, + "learning_rate": 4.8560913910720535e-05, + "loss": 0.19466646909713745, + "step": 745 + }, + { + "epoch": 0.13646288209606988, + "grad_norm": 0.15628376603126526, + "learning_rate": 4.8536198383892725e-05, + "loss": 0.19494034051895143, + "step": 750 + }, + { + "epoch": 0.137372634643377, + "grad_norm": 0.18209289014339447, + "learning_rate": 4.851127882101421e-05, + "loss": 0.18747550249099731, + "step": 755 + }, + { + "epoch": 0.13828238719068414, + "grad_norm": 0.14559614658355713, + "learning_rate": 4.8486155438113454e-05, + "loss": 0.1897158980369568, + "step": 760 + }, + { + "epoch": 0.13919213973799127, + "grad_norm": 0.3198587894439697, + "learning_rate": 4.846082845298586e-05, + "loss": 0.18571001291275024, + "step": 765 + }, + { + "epoch": 0.1401018922852984, + "grad_norm": 0.1486678421497345, + "learning_rate": 4.843529808519189e-05, + "loss": 0.19561930894851684, + "step": 770 + }, + { + "epoch": 0.14101164483260553, + "grad_norm": 0.15318170189857483, + "learning_rate": 4.840956455605509e-05, + "loss": 0.187040114402771, + "step": 775 + }, + { + "epoch": 0.14192139737991266, + "grad_norm": 0.13754244148731232, + "learning_rate": 4.838362808866025e-05, + "loss": 0.18345539569854735, + "step": 780 + }, + { + "epoch": 0.1428311499272198, + "grad_norm": 0.12943248450756073, + "learning_rate": 4.835748890785143e-05, + "loss": 0.1921079397201538, + "step": 785 + }, + { + "epoch": 0.14374090247452692, + "grad_norm": 0.110458143055439, + "learning_rate": 4.833114724023001e-05, + "loss": 0.17927205562591553, + "step": 790 + }, + { + "epoch": 0.14465065502183405, + "grad_norm": 0.2421770840883255, + "learning_rate": 4.830460331415275e-05, + "loss": 0.18317567110061644, + "step": 795 + }, + { + "epoch": 0.14556040756914118, + "grad_norm": 0.14752762019634247, + "learning_rate": 4.8277857359729787e-05, + "loss": 0.1843916058540344, + "step": 800 + }, + { + "epoch": 0.14647016011644834, + "grad_norm": 0.15043556690216064, + "learning_rate": 4.8250909608822644e-05, + "loss": 0.18354393243789674, + "step": 805 + }, + { + "epoch": 0.14737991266375547, + "grad_norm": 0.1381794661283493, + "learning_rate": 4.822376029504223e-05, + "loss": 0.1789781332015991, + "step": 810 + }, + { + "epoch": 0.1482896652110626, + "grad_norm": 0.18386174738407135, + "learning_rate": 4.819640965374681e-05, + "loss": 0.19494292736053467, + "step": 815 + }, + { + "epoch": 0.14919941775836973, + "grad_norm": 0.13829593360424042, + "learning_rate": 4.816885792203996e-05, + "loss": 0.18486063480377196, + "step": 820 + }, + { + "epoch": 0.15010917030567686, + "grad_norm": 0.15033291280269623, + "learning_rate": 4.814110533876852e-05, + "loss": 0.18061509132385253, + "step": 825 + }, + { + "epoch": 0.151018922852984, + "grad_norm": 0.17150473594665527, + "learning_rate": 4.811315214452051e-05, + "loss": 0.18464866876602173, + "step": 830 + }, + { + "epoch": 0.15192867540029112, + "grad_norm": 0.15317125618457794, + "learning_rate": 4.808499858162307e-05, + "loss": 0.1837708592414856, + "step": 835 + }, + { + "epoch": 0.15283842794759825, + "grad_norm": 0.2671392560005188, + "learning_rate": 4.805664489414031e-05, + "loss": 0.19338636398315429, + "step": 840 + }, + { + "epoch": 0.15374818049490538, + "grad_norm": 0.14047028124332428, + "learning_rate": 4.802809132787125e-05, + "loss": 0.17069108486175538, + "step": 845 + }, + { + "epoch": 0.1546579330422125, + "grad_norm": 0.1520431935787201, + "learning_rate": 4.799933813034768e-05, + "loss": 0.18607735633850098, + "step": 850 + }, + { + "epoch": 0.15556768558951964, + "grad_norm": 0.17239463329315186, + "learning_rate": 4.797038555083197e-05, + "loss": 0.18069062232971192, + "step": 855 + }, + { + "epoch": 0.1564774381368268, + "grad_norm": 0.1377955675125122, + "learning_rate": 4.794123384031495e-05, + "loss": 0.18870222568511963, + "step": 860 + }, + { + "epoch": 0.15738719068413393, + "grad_norm": 0.15901461243629456, + "learning_rate": 4.791188325151373e-05, + "loss": 0.18128334283828734, + "step": 865 + }, + { + "epoch": 0.15829694323144106, + "grad_norm": 0.14634132385253906, + "learning_rate": 4.7882334038869495e-05, + "loss": 0.1866163969039917, + "step": 870 + }, + { + "epoch": 0.1592066957787482, + "grad_norm": 0.15361061692237854, + "learning_rate": 4.785258645854529e-05, + "loss": 0.17850807905197144, + "step": 875 + }, + { + "epoch": 0.16011644832605532, + "grad_norm": 0.13751649856567383, + "learning_rate": 4.782264076842385e-05, + "loss": 0.17731113433837892, + "step": 880 + }, + { + "epoch": 0.16102620087336245, + "grad_norm": 0.17909638583660126, + "learning_rate": 4.7792497228105314e-05, + "loss": 0.18344542980194092, + "step": 885 + }, + { + "epoch": 0.16193595342066958, + "grad_norm": 0.16038304567337036, + "learning_rate": 4.776215609890498e-05, + "loss": 0.18868647813796996, + "step": 890 + }, + { + "epoch": 0.1628457059679767, + "grad_norm": 0.1653951108455658, + "learning_rate": 4.773161764385107e-05, + "loss": 0.18614152669906617, + "step": 895 + }, + { + "epoch": 0.16375545851528384, + "grad_norm": 0.16193026304244995, + "learning_rate": 4.770088212768241e-05, + "loss": 0.18564575910568237, + "step": 900 + } + ], + "logging_steps": 5, + "max_steps": 5500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.0362254118877645e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-900/training_args.bin b/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..7fa7678556ad9f674b5958966eca1d9cdec85aba --- /dev/null +++ b/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201 +size 5777 diff --git a/processor_config.json b/processor_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bfaa5e360ad17c2761b7ee3b05abf03e381e8aa1 --- /dev/null +++ b/processor_config.json @@ -0,0 +1,75 @@ +{ + "audio_ms_per_token": 40, + "audio_seq_length": 750, + "feature_extractor": { + "dither": 0.0, + "feature_extractor_type": "Gemma4AudioFeatureExtractor", + "feature_size": 128, + "fft_length": 512, + "fft_overdrive": false, + "frame_length": 320, + "hop_length": 160, + "input_scale_factor": 1.0, + "max_frequency": 8000.0, + "mel_floor": 0.001, + "min_frequency": 0.0, + "padding_side": "left", + "padding_value": 0.0, + "per_bin_mean": null, + "per_bin_stddev": null, + "preemphasis": 0.0, + "preemphasis_htk_flavor": true, + "return_attention_mask": true, + "sampling_rate": 16000 + }, + "image_processor": { + "do_convert_rgb": true, + "do_normalize": false, + "do_rescale": true, + "do_resize": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_processor_type": "Gemma4ImageProcessor", + "image_seq_length": 280, + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 280, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098 + }, + "image_seq_length": 280, + "processor_class": "Gemma4Processor", + "video_processor": { + "do_convert_rgb": true, + "do_normalize": true, + "do_rescale": true, + "do_resize": true, + "do_sample_frames": true, + "image_mean": [ + 0.0, + 0.0, + 0.0 + ], + "image_std": [ + 1.0, + 1.0, + 1.0 + ], + "max_soft_tokens": 70, + "num_frames": 32, + "patch_size": 16, + "pooling_kernel_size": 3, + "resample": 3, + "rescale_factor": 0.00392156862745098, + "return_metadata": false, + "video_processor_type": "Gemma4VideoProcessor" + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1ff9f3e3439a939b971f9919e821bf87e835a503 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f +size 32169626 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4a2534f512673ae766c336141ae8557b6917c049 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,289 @@ +{ + "audio_token": "<|audio|>", + "backend": "tokenizers", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "bos_token": "", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eos_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "extra_special_tokens": [ + "<|video|>" + ], + "image_token": "<|image|>", + "is_local": false, + "mask_token": "", + "model_max_length": 131072, + "model_specific_special_tokens": { + "audio_token": "<|audio|>", + "boa_token": "<|audio>", + "boi_token": "<|image>", + "eoa_token": "", + "eoc_token": "", + "eoi_token": "", + "eot_token": "", + "escape_token": "<|\"|>", + "etc_token": "", + "etd_token": "", + "etr_token": "", + "image_token": "<|image|>", + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>" + }, + "pad_token": "", + "padding_side": "right", + "processor_class": "Gemma4Processor", + "response_schema": { + "properties": { + "content": { + "type": "string" + }, + "role": { + "const": "assistant" + }, + "thinking": { + "type": "string" + }, + "tool_calls": { + "items": { + "properties": { + "function": { + "properties": { + "arguments": { + "additionalProperties": {}, + "type": "object", + "x-parser": "gemma4-tool-call" + }, + "name": { + "type": "string" + } + }, + "type": "object", + "x-regex": "call\\:(?P\\w+)(?P\\{.*\\})" + }, + "type": { + "const": "function" + } + }, + "type": "object" + }, + "type": "array", + "x-regex-iterator": "<\\|tool_call>(.*?)" + } + }, + "type": "object", + "x-regex": "(\\<\\|channel\\>thought\\n(?P.*?)\\)?(?P\\<\\|tool_call\\>.*\\)?(?P(?:(?!\\)(?!\\<\\|tool_response\\>).)+)?(?:\\|\\<\\|tool_response\\>)?" + }, + "soc_token": "<|channel>", + "sot_token": "<|turn>", + "stc_token": "<|tool_call>", + "std_token": "<|tool>", + "str_token": "<|tool_response>", + "think_token": "<|think|>", + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "added_tokens_decoder": { + "0": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "1": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "2": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "3": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "4": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "46": { + "content": "<|tool>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "47": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "48": { + "content": "<|tool_call>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "49": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "50": { + "content": "<|tool_response>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "51": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "52": { + "content": "<|\"|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "98": { + "content": "<|think|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "100": { + "content": "<|channel>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "101": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "105": { + "content": "<|turn>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "106": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "255999": { + "content": "<|image>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "256000": { + "content": "<|audio>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258880": { + "content": "<|image|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258881": { + "content": "<|audio|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258882": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258883": { + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + "258884": { + "content": "<|video|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + } +}