Sathvik0101 committed on
Commit
136ad6a
·
verified ·
1 Parent(s): e676cb5

Upload cyber-duel-tiny LoRA adapter (SFT)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-3-270m-it
3
+ library_name: peft
4
+ model_name: sft
5
+ tags:
6
+ - base_model:adapter:google/gemma-3-270m-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ license: gemma
12
+ pipeline_tag: text-generation
13
+ ---
14
+
15
+ # Model Card for sft
16
+
17
+ This model is a fine-tuned version of [google/gemma-3-270m-it](https://huggingface.co/google/gemma-3-270m-it).
18
+ It has been trained using [TRL](https://github.com/huggingface/trl).
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from transformers import pipeline
24
+
25
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
26
+ generator = pipeline("text-generation", model="None", device="cuda")
27
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
28
+ print(output["generated_text"])
29
+ ```
30
+
31
+ ## Training procedure
32
+
33
+
34
+
35
+
36
+
37
+ This model was trained with SFT.
38
+
39
+ ### Framework versions
40
+
41
+ - PEFT: 0.19.1
42
+ - TRL: 0.29.1
43
+ - Transformers: 5.12.0
44
+ - PyTorch: 2.12.0
45
+ - Datasets: 4.3.0
46
+ - Tokenizers: 0.22.2
47
+
48
+ ## Citations
49
+
50
+
51
+
52
+ Cite TRL as:
53
+
54
+ ```bibtex
55
+ @software{vonwerra2020trl,
56
+ title = {{TRL: Transformers Reinforcement Learning}},
57
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
58
+ license = {Apache-2.0},
59
+ url = {https://github.com/huggingface/trl},
60
+ year = {2020}
61
+ }
62
+ ```
adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-3-270m-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 16,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "gate_proj",
34
+ "k_proj",
35
+ "q_proj",
36
+ "v_proj",
37
+ "up_proj",
38
+ "down_proj",
39
+ "o_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1435aaec17f56e3fd3d82fd66514ac8b1261af720fe0159a80b04fa37617ee55
3
+ size 15220968
chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-4500/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-3-270m-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-3-270m-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
checkpoint-4500/adapter_config.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-3-270m-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 16,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": [
33
+ "gate_proj",
34
+ "k_proj",
35
+ "q_proj",
36
+ "v_proj",
37
+ "up_proj",
38
+ "down_proj",
39
+ "o_proj"
40
+ ],
41
+ "target_parameters": null,
42
+ "task_type": "CAUSAL_LM",
43
+ "trainable_token_indices": null,
44
+ "use_bdlora": null,
45
+ "use_dora": false,
46
+ "use_qalora": false,
47
+ "use_rslora": false
48
+ }
checkpoint-4500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1435aaec17f56e3fd3d82fd66514ac8b1261af720fe0159a80b04fa37617ee55
3
+ size 15220968
checkpoint-4500/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-4500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e55e133ec1d8081e2245bd439747f4949c2d151e5dfe0e3e1f1b864997d7c321
3
+ size 30587787
checkpoint-4500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70377365f027742819b59fa5cdfb0b0d71dfb3ffad90b7e9baf531c1cd51f84b
3
+ size 14581
checkpoint-4500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e44246e9e33d2d872a933830bc5eeb592ee23a7ca2e79f1ce162a0086ef17a04
3
+ size 1465
checkpoint-4500/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daab2354f8a74e70d70b4d1f804939b68a8c9624dd06cb7858e52dd8970e9726
3
+ size 33384567
checkpoint-4500/tokenizer_config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "boi_token": "<start_of_image>",
4
+ "bos_token": "<bos>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eoi_token": "<end_of_image>",
7
+ "eos_token": "<eos>",
8
+ "image_token": "<image_soft_token>",
9
+ "is_local": false,
10
+ "local_files_only": false,
11
+ "mask_token": "<mask>",
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "model_specific_special_tokens": {
14
+ "boi_token": "<start_of_image>",
15
+ "eoi_token": "<end_of_image>",
16
+ "image_token": "<image_soft_token>"
17
+ },
18
+ "pad_token": "<pad>",
19
+ "padding_side": "left",
20
+ "sp_model_kwargs": null,
21
+ "spaces_between_special_tokens": false,
22
+ "tokenizer_class": "GemmaTokenizer",
23
+ "unk_token": "<unk>",
24
+ "use_default_system_prompt": false
25
+ }
checkpoint-4500/trainer_state.json ADDED
@@ -0,0 +1,2284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 4500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.548675110936165,
14
+ "epoch": 0.013333333333333334,
15
+ "grad_norm": 6.004289150238037,
16
+ "learning_rate": 1.688888888888889e-05,
17
+ "loss": 3.992266082763672,
18
+ "mean_token_accuracy": 0.4231670804321766,
19
+ "num_tokens": 102186.0,
20
+ "step": 20
21
+ },
22
+ {
23
+ "entropy": 2.2785673171281813,
24
+ "epoch": 0.02666666666666667,
25
+ "grad_norm": 3.0929577350616455,
26
+ "learning_rate": 3.466666666666667e-05,
27
+ "loss": 2.8258544921875,
28
+ "mean_token_accuracy": 0.5193272314965725,
29
+ "num_tokens": 204333.0,
30
+ "step": 40
31
+ },
32
+ {
33
+ "entropy": 1.6070524707436562,
34
+ "epoch": 0.04,
35
+ "grad_norm": 2.344675302505493,
36
+ "learning_rate": 5.244444444444445e-05,
37
+ "loss": 1.4720239639282227,
38
+ "mean_token_accuracy": 0.7221725225448609,
39
+ "num_tokens": 306198.0,
40
+ "step": 60
41
+ },
42
+ {
43
+ "entropy": 0.4960472501814365,
44
+ "epoch": 0.05333333333333334,
45
+ "grad_norm": 2.080559730529785,
46
+ "learning_rate": 7.022222222222222e-05,
47
+ "loss": 0.4806540012359619,
48
+ "mean_token_accuracy": 0.9012673273682594,
49
+ "num_tokens": 408035.0,
50
+ "step": 80
51
+ },
52
+ {
53
+ "entropy": 0.1692034611478448,
54
+ "epoch": 0.06666666666666667,
55
+ "grad_norm": 1.5278408527374268,
56
+ "learning_rate": 8.800000000000001e-05,
57
+ "loss": 0.1592921018600464,
58
+ "mean_token_accuracy": 0.9599622413516045,
59
+ "num_tokens": 509962.0,
60
+ "step": 100
61
+ },
62
+ {
63
+ "entropy": 0.11861470770090818,
64
+ "epoch": 0.08,
65
+ "grad_norm": 0.999698281288147,
66
+ "learning_rate": 0.00010577777777777777,
67
+ "loss": 0.11052950620651245,
68
+ "mean_token_accuracy": 0.9685859054327011,
69
+ "num_tokens": 611562.0,
70
+ "step": 120
71
+ },
72
+ {
73
+ "entropy": 0.1036016432568431,
74
+ "epoch": 0.09333333333333334,
75
+ "grad_norm": 0.9115886092185974,
76
+ "learning_rate": 0.00012355555555555557,
77
+ "loss": 0.0914052426815033,
78
+ "mean_token_accuracy": 0.9704682394862175,
79
+ "num_tokens": 713684.0,
80
+ "step": 140
81
+ },
82
+ {
83
+ "entropy": 0.09701150320470334,
84
+ "epoch": 0.10666666666666667,
85
+ "grad_norm": 0.6500758528709412,
86
+ "learning_rate": 0.00014133333333333334,
87
+ "loss": 0.08168401718139648,
88
+ "mean_token_accuracy": 0.9728480890393257,
89
+ "num_tokens": 815728.0,
90
+ "step": 160
91
+ },
92
+ {
93
+ "entropy": 0.0902867017313838,
94
+ "epoch": 0.12,
95
+ "grad_norm": 0.4816068112850189,
96
+ "learning_rate": 0.00015911111111111112,
97
+ "loss": 0.0673690140247345,
98
+ "mean_token_accuracy": 0.9743824899196625,
99
+ "num_tokens": 917588.0,
100
+ "step": 180
101
+ },
102
+ {
103
+ "entropy": 0.07180177625268698,
104
+ "epoch": 0.13333333333333333,
105
+ "grad_norm": 0.42101994156837463,
106
+ "learning_rate": 0.0001768888888888889,
107
+ "loss": 0.05838126540184021,
108
+ "mean_token_accuracy": 0.975058288872242,
109
+ "num_tokens": 1020041.0,
110
+ "step": 200
111
+ },
112
+ {
113
+ "entropy": 0.06279958104714752,
114
+ "epoch": 0.14666666666666667,
115
+ "grad_norm": 0.41531553864479065,
116
+ "learning_rate": 0.0001946666666666667,
117
+ "loss": 0.05505728721618652,
118
+ "mean_token_accuracy": 0.976372754573822,
119
+ "num_tokens": 1121933.0,
120
+ "step": 220
121
+ },
122
+ {
123
+ "entropy": 0.05904992977157235,
124
+ "epoch": 0.16,
125
+ "grad_norm": 0.5666757822036743,
126
+ "learning_rate": 0.00019999470763544457,
127
+ "loss": 0.052491378784179685,
128
+ "mean_token_accuracy": 0.9762984499335289,
129
+ "num_tokens": 1223670.0,
130
+ "step": 240
131
+ },
132
+ {
133
+ "entropy": 0.05695097530260682,
134
+ "epoch": 0.17333333333333334,
135
+ "grad_norm": 0.39107683300971985,
136
+ "learning_rate": 0.00019996878719840213,
137
+ "loss": 0.05221613645553589,
138
+ "mean_token_accuracy": 0.9769444420933724,
139
+ "num_tokens": 1325903.0,
140
+ "step": 260
141
+ },
142
+ {
143
+ "entropy": 0.05454709641635418,
144
+ "epoch": 0.18666666666666668,
145
+ "grad_norm": 0.2881831228733063,
146
+ "learning_rate": 0.00019992127221406275,
147
+ "loss": 0.05105168223381042,
148
+ "mean_token_accuracy": 0.9766697883605957,
149
+ "num_tokens": 1427883.0,
150
+ "step": 280
151
+ },
152
+ {
153
+ "entropy": 0.05568597661331296,
154
+ "epoch": 0.2,
155
+ "grad_norm": 0.2969810962677002,
156
+ "learning_rate": 0.00019985217294627577,
157
+ "loss": 0.05190561413764953,
158
+ "mean_token_accuracy": 0.9768449172377587,
159
+ "num_tokens": 1529850.0,
160
+ "step": 300
161
+ },
162
+ {
163
+ "entropy": 0.05605392120778561,
164
+ "epoch": 0.21333333333333335,
165
+ "grad_norm": 0.39327648282051086,
166
+ "learning_rate": 0.00019976150432137423,
167
+ "loss": 0.05125090479850769,
168
+ "mean_token_accuracy": 0.9767352715134621,
169
+ "num_tokens": 1631796.0,
170
+ "step": 320
171
+ },
172
+ {
173
+ "entropy": 0.05631188191473484,
174
+ "epoch": 0.22666666666666666,
175
+ "grad_norm": 0.2569703757762909,
176
+ "learning_rate": 0.00019964928592495045,
177
+ "loss": 0.05136184692382813,
178
+ "mean_token_accuracy": 0.9767047330737114,
179
+ "num_tokens": 1733431.0,
180
+ "step": 340
181
+ },
182
+ {
183
+ "entropy": 0.054749509692192076,
184
+ "epoch": 0.24,
185
+ "grad_norm": 0.2503352761268616,
186
+ "learning_rate": 0.00019951554199762526,
187
+ "loss": 0.04927194118499756,
188
+ "mean_token_accuracy": 0.9772127717733383,
189
+ "num_tokens": 1835736.0,
190
+ "step": 360
191
+ },
192
+ {
193
+ "entropy": 0.053956403583288196,
194
+ "epoch": 0.25333333333333335,
195
+ "grad_norm": 0.26568838953971863,
196
+ "learning_rate": 0.00019936030142981182,
197
+ "loss": 0.04831983149051666,
198
+ "mean_token_accuracy": 0.9772727772593498,
199
+ "num_tokens": 1937395.0,
200
+ "step": 380
201
+ },
202
+ {
203
+ "entropy": 0.05297513753175735,
204
+ "epoch": 0.26666666666666666,
205
+ "grad_norm": 0.21782436966896057,
206
+ "learning_rate": 0.00019918359775547489,
207
+ "loss": 0.048703563213348386,
208
+ "mean_token_accuracy": 0.9776117220520973,
209
+ "num_tokens": 2039661.0,
210
+ "step": 400
211
+ },
212
+ {
213
+ "entropy": 0.05235615810379386,
214
+ "epoch": 0.28,
215
+ "grad_norm": 0.2456953078508377,
216
+ "learning_rate": 0.00019898546914488697,
217
+ "loss": 0.04742903709411621,
218
+ "mean_token_accuracy": 0.9779680415987968,
219
+ "num_tokens": 2141312.0,
220
+ "step": 420
221
+ },
222
+ {
223
+ "entropy": 0.05012538954615593,
224
+ "epoch": 0.29333333333333333,
225
+ "grad_norm": 0.17193332314491272,
226
+ "learning_rate": 0.00019876595839638314,
227
+ "loss": 0.04511936604976654,
228
+ "mean_token_accuracy": 0.978802102804184,
229
+ "num_tokens": 2243220.0,
230
+ "step": 440
231
+ },
232
+ {
233
+ "entropy": 0.050425101164728404,
234
+ "epoch": 0.30666666666666664,
235
+ "grad_norm": 0.19117344915866852,
236
+ "learning_rate": 0.00019852511292711608,
237
+ "loss": 0.04454375207424164,
238
+ "mean_token_accuracy": 0.9793910697102547,
239
+ "num_tokens": 2345110.0,
240
+ "step": 460
241
+ },
242
+ {
243
+ "entropy": 0.0502777012065053,
244
+ "epoch": 0.32,
245
+ "grad_norm": 0.1484805941581726,
246
+ "learning_rate": 0.0001982629847628132,
247
+ "loss": 0.045093965530395505,
248
+ "mean_token_accuracy": 0.9782336875796318,
249
+ "num_tokens": 2446814.0,
250
+ "step": 480
251
+ },
252
+ {
253
+ "entropy": 0.04916129466146231,
254
+ "epoch": 0.3333333333333333,
255
+ "grad_norm": 0.17659035325050354,
256
+ "learning_rate": 0.0001979796305265386,
257
+ "loss": 0.04536721706390381,
258
+ "mean_token_accuracy": 0.9788262486457825,
259
+ "num_tokens": 2548699.0,
260
+ "step": 500
261
+ },
262
+ {
263
+ "entropy": 0.04801498837769032,
264
+ "epoch": 0.3466666666666667,
265
+ "grad_norm": 0.18467392027378082,
266
+ "learning_rate": 0.0001976751114264616,
267
+ "loss": 0.04428495168685913,
268
+ "mean_token_accuracy": 0.9791656643152237,
269
+ "num_tokens": 2650925.0,
270
+ "step": 520
271
+ },
272
+ {
273
+ "entropy": 0.04973381711170077,
274
+ "epoch": 0.36,
275
+ "grad_norm": 0.22871969640254974,
276
+ "learning_rate": 0.0001973494932426351,
277
+ "loss": 0.04659122526645661,
278
+ "mean_token_accuracy": 0.9777900949120522,
279
+ "num_tokens": 2753152.0,
280
+ "step": 540
281
+ },
282
+ {
283
+ "entropy": 0.050069388933479786,
284
+ "epoch": 0.37333333333333335,
285
+ "grad_norm": 0.14215655624866486,
286
+ "learning_rate": 0.00019700284631278623,
287
+ "loss": 0.04543479979038238,
288
+ "mean_token_accuracy": 0.9784642964601517,
289
+ "num_tokens": 2855157.0,
290
+ "step": 560
291
+ },
292
+ {
293
+ "entropy": 0.048892225697636606,
294
+ "epoch": 0.38666666666666666,
295
+ "grad_norm": 0.14485321938991547,
296
+ "learning_rate": 0.00019663524551712236,
297
+ "loss": 0.043998023867607115,
298
+ "mean_token_accuracy": 0.9789358124136924,
299
+ "num_tokens": 2957430.0,
300
+ "step": 580
301
+ },
302
+ {
303
+ "entropy": 0.049546369817107916,
304
+ "epoch": 0.4,
305
+ "grad_norm": 0.1522541642189026,
306
+ "learning_rate": 0.0001962467702621562,
307
+ "loss": 0.04526585042476654,
308
+ "mean_token_accuracy": 0.9789461970329285,
309
+ "num_tokens": 3059857.0,
310
+ "step": 600
311
+ },
312
+ {
313
+ "entropy": 0.048749705869704486,
314
+ "epoch": 0.41333333333333333,
315
+ "grad_norm": 0.14776450395584106,
316
+ "learning_rate": 0.00019583750446355286,
317
+ "loss": 0.04488187730312347,
318
+ "mean_token_accuracy": 0.9790951684117317,
319
+ "num_tokens": 3161377.0,
320
+ "step": 620
321
+ },
322
+ {
323
+ "entropy": 0.04819442732259631,
324
+ "epoch": 0.4266666666666667,
325
+ "grad_norm": 0.155587837100029,
326
+ "learning_rate": 0.000195407536528003,
327
+ "loss": 0.04454294443130493,
328
+ "mean_token_accuracy": 0.9792696803808212,
329
+ "num_tokens": 3263597.0,
330
+ "step": 640
331
+ },
332
+ {
333
+ "entropy": 0.048739112261682746,
334
+ "epoch": 0.44,
335
+ "grad_norm": 0.24131548404693604,
336
+ "learning_rate": 0.0001949569593341258,
337
+ "loss": 0.04449517726898193,
338
+ "mean_token_accuracy": 0.9789462149143219,
339
+ "num_tokens": 3365773.0,
340
+ "step": 660
341
+ },
342
+ {
343
+ "entropy": 0.04729501772671938,
344
+ "epoch": 0.4533333333333333,
345
+ "grad_norm": 0.16851578652858734,
346
+ "learning_rate": 0.00019448587021240611,
347
+ "loss": 0.0436316579580307,
348
+ "mean_token_accuracy": 0.9790461182594299,
349
+ "num_tokens": 3467719.0,
350
+ "step": 680
351
+ },
352
+ {
353
+ "entropy": 0.048864346370100974,
354
+ "epoch": 0.4666666666666667,
355
+ "grad_norm": 0.17274609208106995,
356
+ "learning_rate": 0.00019399437092416967,
357
+ "loss": 0.04535620212554932,
358
+ "mean_token_accuracy": 0.9788791447877884,
359
+ "num_tokens": 3569559.0,
360
+ "step": 700
361
+ },
362
+ {
363
+ "entropy": 0.04898029724135995,
364
+ "epoch": 0.48,
365
+ "grad_norm": 0.13499416410923004,
366
+ "learning_rate": 0.00019348256763960145,
367
+ "loss": 0.045434945821762086,
368
+ "mean_token_accuracy": 0.9788094267249108,
369
+ "num_tokens": 3671491.0,
370
+ "step": 720
371
+ },
372
+ {
373
+ "entropy": 0.04580554729327559,
374
+ "epoch": 0.49333333333333335,
375
+ "grad_norm": 0.12506447732448578,
376
+ "learning_rate": 0.00019295057091481147,
377
+ "loss": 0.04356709420681,
378
+ "mean_token_accuracy": 0.9791021943092346,
379
+ "num_tokens": 3773051.0,
380
+ "step": 740
381
+ },
382
+ {
383
+ "entropy": 0.047521025873720646,
384
+ "epoch": 0.5066666666666667,
385
+ "grad_norm": 0.121482253074646,
386
+ "learning_rate": 0.00019239849566795323,
387
+ "loss": 0.044592976570129395,
388
+ "mean_token_accuracy": 0.9786569505929947,
389
+ "num_tokens": 3875663.0,
390
+ "step": 760
391
+ },
392
+ {
393
+ "entropy": 0.045532725658267735,
394
+ "epoch": 0.52,
395
+ "grad_norm": 0.13711974024772644,
396
+ "learning_rate": 0.00019182646115439996,
397
+ "loss": 0.042892631888389585,
398
+ "mean_token_accuracy": 0.979731023311615,
399
+ "num_tokens": 3977742.0,
400
+ "step": 780
401
+ },
402
+ {
403
+ "entropy": 0.04748789621517062,
404
+ "epoch": 0.5333333333333333,
405
+ "grad_norm": 0.126457080245018,
406
+ "learning_rate": 0.00019123459094098398,
407
+ "loss": 0.04508825838565826,
408
+ "mean_token_accuracy": 0.9783048242330551,
409
+ "num_tokens": 4079943.0,
410
+ "step": 800
411
+ },
412
+ {
413
+ "entropy": 0.045889181550592184,
414
+ "epoch": 0.5466666666666666,
415
+ "grad_norm": 0.12796172499656677,
416
+ "learning_rate": 0.00019062301287930446,
417
+ "loss": 0.04326332211494446,
418
+ "mean_token_accuracy": 0.979296863079071,
419
+ "num_tokens": 4181963.0,
420
+ "step": 820
421
+ },
422
+ {
423
+ "entropy": 0.045128315966576335,
424
+ "epoch": 0.56,
425
+ "grad_norm": 0.0813562199473381,
426
+ "learning_rate": 0.00018999185907811009,
427
+ "loss": 0.04314403533935547,
428
+ "mean_token_accuracy": 0.9794226452708245,
429
+ "num_tokens": 4283940.0,
430
+ "step": 840
431
+ },
432
+ {
433
+ "entropy": 0.04633188545703888,
434
+ "epoch": 0.5733333333333334,
435
+ "grad_norm": 0.13212576508522034,
436
+ "learning_rate": 0.00018934126587476162,
437
+ "loss": 0.04438722729682922,
438
+ "mean_token_accuracy": 0.9792284339666366,
439
+ "num_tokens": 4386033.0,
440
+ "step": 860
441
+ },
442
+ {
443
+ "entropy": 0.046954588033258915,
444
+ "epoch": 0.5866666666666667,
445
+ "grad_norm": 0.24543477594852448,
446
+ "learning_rate": 0.0001886713738057815,
447
+ "loss": 0.04496486783027649,
448
+ "mean_token_accuracy": 0.978602097928524,
449
+ "num_tokens": 4488033.0,
450
+ "step": 880
451
+ },
452
+ {
453
+ "entropy": 0.047627194225788115,
454
+ "epoch": 0.6,
455
+ "grad_norm": 0.15973004698753357,
456
+ "learning_rate": 0.000187982327576496,
457
+ "loss": 0.0447381466627121,
458
+ "mean_token_accuracy": 0.978855662047863,
459
+ "num_tokens": 4590393.0,
460
+ "step": 900
461
+ },
462
+ {
463
+ "entropy": 0.049009975790977475,
464
+ "epoch": 0.6133333333333333,
465
+ "grad_norm": 0.4588961899280548,
466
+ "learning_rate": 0.000187274276029777,
467
+ "loss": 0.04679847955703735,
468
+ "mean_token_accuracy": 0.9788309365510941,
469
+ "num_tokens": 4692314.0,
470
+ "step": 920
471
+ },
472
+ {
473
+ "entropy": 0.05283641302958131,
474
+ "epoch": 0.6266666666666667,
475
+ "grad_norm": 0.17900370061397552,
476
+ "learning_rate": 0.00018654737211389004,
477
+ "loss": 0.04886095821857452,
478
+ "mean_token_accuracy": 0.9779917612671852,
479
+ "num_tokens": 4794297.0,
480
+ "step": 940
481
+ },
482
+ {
483
+ "entropy": 0.05194324087351561,
484
+ "epoch": 0.64,
485
+ "grad_norm": 0.2685967683792114,
486
+ "learning_rate": 0.00018580177284945566,
487
+ "loss": 0.04925000071525574,
488
+ "mean_token_accuracy": 0.9787736907601357,
489
+ "num_tokens": 4896719.0,
490
+ "step": 960
491
+ },
492
+ {
493
+ "entropy": 0.04687528889626265,
494
+ "epoch": 0.6533333333333333,
495
+ "grad_norm": 0.3776164948940277,
496
+ "learning_rate": 0.0001850376392955307,
497
+ "loss": 0.04358056485652924,
498
+ "mean_token_accuracy": 0.9792398914694787,
499
+ "num_tokens": 4998801.0,
500
+ "step": 980
501
+ },
502
+ {
503
+ "entropy": 0.04969303589314222,
504
+ "epoch": 0.6666666666666666,
505
+ "grad_norm": 0.10363394021987915,
506
+ "learning_rate": 0.00018425513651481747,
507
+ "loss": 0.04642247259616852,
508
+ "mean_token_accuracy": 0.9783516511321068,
509
+ "num_tokens": 5100997.0,
510
+ "step": 1000
511
+ },
512
+ {
513
+ "entropy": 0.047921424824744464,
514
+ "epoch": 0.68,
515
+ "grad_norm": 0.1332525759935379,
516
+ "learning_rate": 0.00018345443353800839,
517
+ "loss": 0.04439827501773834,
518
+ "mean_token_accuracy": 0.9791212469339371,
519
+ "num_tokens": 5202682.0,
520
+ "step": 1020
521
+ },
522
+ {
523
+ "entropy": 0.047575213573873044,
524
+ "epoch": 0.6933333333333334,
525
+ "grad_norm": 0.08405883610248566,
526
+ "learning_rate": 0.00018263570332727275,
527
+ "loss": 0.043652302026748656,
528
+ "mean_token_accuracy": 0.9786113709211349,
529
+ "num_tokens": 5304249.0,
530
+ "step": 1040
531
+ },
532
+ {
533
+ "entropy": 0.04774442110210657,
534
+ "epoch": 0.7066666666666667,
535
+ "grad_norm": 0.09579049050807953,
536
+ "learning_rate": 0.00018179912273889501,
537
+ "loss": 0.043841779232025146,
538
+ "mean_token_accuracy": 0.9791841998696327,
539
+ "num_tokens": 5406457.0,
540
+ "step": 1060
541
+ },
542
+ {
543
+ "entropy": 0.04760089740157127,
544
+ "epoch": 0.72,
545
+ "grad_norm": 0.13812078535556793,
546
+ "learning_rate": 0.00018094487248507127,
547
+ "loss": 0.04469398260116577,
548
+ "mean_token_accuracy": 0.9787818253040313,
549
+ "num_tokens": 5508325.0,
550
+ "step": 1080
551
+ },
552
+ {
553
+ "entropy": 0.04628140116110444,
554
+ "epoch": 0.7333333333333333,
555
+ "grad_norm": 0.09030942618846893,
556
+ "learning_rate": 0.00018007313709487334,
557
+ "loss": 0.043077632784843445,
558
+ "mean_token_accuracy": 0.9798856094479561,
559
+ "num_tokens": 5609876.0,
560
+ "step": 1100
561
+ },
562
+ {
563
+ "entropy": 0.04589016325771809,
564
+ "epoch": 0.7466666666666667,
565
+ "grad_norm": 0.0854763314127922,
566
+ "learning_rate": 0.00017918410487438805,
567
+ "loss": 0.04384036958217621,
568
+ "mean_token_accuracy": 0.9791762813925743,
569
+ "num_tokens": 5712340.0,
570
+ "step": 1120
571
+ },
572
+ {
573
+ "entropy": 0.04689710335806012,
574
+ "epoch": 0.76,
575
+ "grad_norm": 0.10074414312839508,
576
+ "learning_rate": 0.00017827796786604042,
577
+ "loss": 0.04416438341140747,
578
+ "mean_token_accuracy": 0.979088181257248,
579
+ "num_tokens": 5814598.0,
580
+ "step": 1140
581
+ },
582
+ {
583
+ "entropy": 0.04654768798500299,
584
+ "epoch": 0.7733333333333333,
585
+ "grad_norm": 0.07522693276405334,
586
+ "learning_rate": 0.0001773549218071105,
587
+ "loss": 0.0432561069726944,
588
+ "mean_token_accuracy": 0.9793283045291901,
589
+ "num_tokens": 5916277.0,
590
+ "step": 1160
591
+ },
592
+ {
593
+ "entropy": 0.0449189274571836,
594
+ "epoch": 0.7866666666666666,
595
+ "grad_norm": 0.12037090212106705,
596
+ "learning_rate": 0.00017641516608745114,
597
+ "loss": 0.04267836213111877,
598
+ "mean_token_accuracy": 0.9796097055077553,
599
+ "num_tokens": 6018305.0,
600
+ "step": 1180
601
+ },
602
+ {
603
+ "entropy": 0.04518893817439675,
604
+ "epoch": 0.8,
605
+ "grad_norm": 0.15295696258544922,
606
+ "learning_rate": 0.0001754589037064175,
607
+ "loss": 0.04324706792831421,
608
+ "mean_token_accuracy": 0.9793181642889977,
609
+ "num_tokens": 6120161.0,
610
+ "step": 1200
611
+ },
612
+ {
613
+ "entropy": 0.0459614584222436,
614
+ "epoch": 0.8133333333333334,
615
+ "grad_norm": 0.10844975709915161,
616
+ "learning_rate": 0.0001744863412290165,
617
+ "loss": 0.04338730275630951,
618
+ "mean_token_accuracy": 0.9787795886397361,
619
+ "num_tokens": 6221926.0,
620
+ "step": 1220
621
+ },
622
+ {
623
+ "entropy": 0.04700327459722757,
624
+ "epoch": 0.8266666666666667,
625
+ "grad_norm": 0.12464659661054611,
626
+ "learning_rate": 0.00017349768874128603,
627
+ "loss": 0.04424178600311279,
628
+ "mean_token_accuracy": 0.9791146576404571,
629
+ "num_tokens": 6323994.0,
630
+ "step": 1240
631
+ },
632
+ {
633
+ "entropy": 0.045251396391540764,
634
+ "epoch": 0.84,
635
+ "grad_norm": 0.10585556924343109,
636
+ "learning_rate": 0.00017249315980491373,
637
+ "loss": 0.04233089089393616,
638
+ "mean_token_accuracy": 0.980115057528019,
639
+ "num_tokens": 6425801.0,
640
+ "step": 1260
641
+ },
642
+ {
643
+ "entropy": 0.04711138280108571,
644
+ "epoch": 0.8533333333333334,
645
+ "grad_norm": 0.10078904032707214,
646
+ "learning_rate": 0.0001714729714111049,
647
+ "loss": 0.043426957726478574,
648
+ "mean_token_accuracy": 0.9791831955313682,
649
+ "num_tokens": 6527510.0,
650
+ "step": 1280
651
+ },
652
+ {
653
+ "entropy": 0.04563735323026776,
654
+ "epoch": 0.8666666666666667,
655
+ "grad_norm": 0.10202273726463318,
656
+ "learning_rate": 0.00017043734393370965,
657
+ "loss": 0.043241679668426514,
658
+ "mean_token_accuracy": 0.9791531518101693,
659
+ "num_tokens": 6630052.0,
660
+ "step": 1300
661
+ },
662
+ {
663
+ "entropy": 0.04624767201021314,
664
+ "epoch": 0.88,
665
+ "grad_norm": 0.1017850786447525,
666
+ "learning_rate": 0.0001693865010816192,
667
+ "loss": 0.043641078472137454,
668
+ "mean_token_accuracy": 0.9791532784700394,
669
+ "num_tokens": 6732187.0,
670
+ "step": 1320
671
+ },
672
+ {
673
+ "entropy": 0.04555416237562895,
674
+ "epoch": 0.8933333333333333,
675
+ "grad_norm": 0.0906793549656868,
676
+ "learning_rate": 0.00016832066985044195,
677
+ "loss": 0.04301130175590515,
678
+ "mean_token_accuracy": 0.9790184095501899,
679
+ "num_tokens": 6834270.0,
680
+ "step": 1340
681
+ },
682
+ {
683
+ "entropy": 0.044891719426959756,
684
+ "epoch": 0.9066666666666666,
685
+ "grad_norm": 0.06667148321866989,
686
+ "learning_rate": 0.00016724008047346947,
687
+ "loss": 0.04192114770412445,
688
+ "mean_token_accuracy": 0.9799642145633698,
689
+ "num_tokens": 6936310.0,
690
+ "step": 1360
691
+ },
692
+ {
693
+ "entropy": 0.04586669374257326,
694
+ "epoch": 0.92,
695
+ "grad_norm": 0.12085918337106705,
696
+ "learning_rate": 0.0001661449663719432,
697
+ "loss": 0.04404585361480713,
698
+ "mean_token_accuracy": 0.9786775410175323,
699
+ "num_tokens": 7037928.0,
700
+ "step": 1380
701
+ },
702
+ {
703
+ "entropy": 0.04691507248207927,
704
+ "epoch": 0.9333333333333333,
705
+ "grad_norm": 0.09447435289621353,
706
+ "learning_rate": 0.00016503556410463234,
707
+ "loss": 0.04427667260169983,
708
+ "mean_token_accuracy": 0.9788988634943963,
709
+ "num_tokens": 7139966.0,
710
+ "step": 1400
711
+ },
712
+ {
713
+ "entropy": 0.04686050089076162,
714
+ "epoch": 0.9466666666666667,
715
+ "grad_norm": 0.07748451828956604,
716
+ "learning_rate": 0.0001639121133167342,
717
+ "loss": 0.043699628114700316,
718
+ "mean_token_accuracy": 0.9789900943636894,
719
+ "num_tokens": 7242243.0,
720
+ "step": 1420
721
+ },
722
+ {
723
+ "entropy": 0.04621442370116711,
724
+ "epoch": 0.96,
725
+ "grad_norm": 0.0875391811132431,
726
+ "learning_rate": 0.0001627748566881077,
727
+ "loss": 0.0435163140296936,
728
+ "mean_token_accuracy": 0.9793973177671432,
729
+ "num_tokens": 7344333.0,
730
+ "step": 1440
731
+ },
732
+ {
733
+ "entropy": 0.04617999196052551,
734
+ "epoch": 0.9733333333333334,
735
+ "grad_norm": 0.11651453375816345,
736
+ "learning_rate": 0.00016162403988085147,
737
+ "loss": 0.0438153475522995,
738
+ "mean_token_accuracy": 0.9788163512945175,
739
+ "num_tokens": 7446501.0,
740
+ "step": 1460
741
+ },
742
+ {
743
+ "entropy": 0.04541895473375916,
744
+ "epoch": 0.9866666666666667,
745
+ "grad_norm": 0.10714145004749298,
746
+ "learning_rate": 0.0001604599114862375,
747
+ "loss": 0.043173199892044066,
748
+ "mean_token_accuracy": 0.9791891872882843,
749
+ "num_tokens": 7548187.0,
750
+ "step": 1480
751
+ },
752
+ {
753
+ "entropy": 0.04610758051276207,
754
+ "epoch": 1.0,
755
+ "grad_norm": 0.1056915670633316,
756
+ "learning_rate": 0.0001592827229710124,
757
+ "loss": 0.04365978240966797,
758
+ "mean_token_accuracy": 0.9787515595555305,
759
+ "num_tokens": 7650185.0,
760
+ "step": 1500
761
+ },
762
+ {
763
+ "entropy": 0.04553080843761563,
764
+ "epoch": 1.0133333333333334,
765
+ "grad_norm": 0.08358001708984375,
766
+ "learning_rate": 0.00015809272862307724,
767
+ "loss": 0.04281379580497742,
768
+ "mean_token_accuracy": 0.9787902727723121,
769
+ "num_tokens": 7751822.0,
770
+ "step": 1520
771
+ },
772
+ {
773
+ "entropy": 0.04557240409776568,
774
+ "epoch": 1.0266666666666666,
775
+ "grad_norm": 0.0894247367978096,
776
+ "learning_rate": 0.00015689018549655813,
777
+ "loss": 0.043633687496185306,
778
+ "mean_token_accuracy": 0.9793074056506157,
779
+ "num_tokens": 7853924.0,
780
+ "step": 1540
781
+ },
782
+ {
783
+ "entropy": 0.04621814098209143,
784
+ "epoch": 1.04,
785
+ "grad_norm": 0.060622621327638626,
786
+ "learning_rate": 0.00015567535335627916,
787
+ "loss": 0.043806785345077516,
788
+ "mean_token_accuracy": 0.9790619671344757,
789
+ "num_tokens": 7955729.0,
790
+ "step": 1560
791
+ },
792
+ {
793
+ "entropy": 0.04529289873316884,
794
+ "epoch": 1.0533333333333332,
795
+ "grad_norm": 0.06778731197118759,
796
+ "learning_rate": 0.0001544484946216499,
797
+ "loss": 0.04349397122859955,
798
+ "mean_token_accuracy": 0.9791216805577279,
799
+ "num_tokens": 8057521.0,
800
+ "step": 1580
801
+ },
802
+ {
803
+ "entropy": 0.045565437898039816,
804
+ "epoch": 1.0666666666666667,
805
+ "grad_norm": 0.09741676598787308,
806
+ "learning_rate": 0.00015320987430997939,
807
+ "loss": 0.043324217200279236,
808
+ "mean_token_accuracy": 0.9791115581989288,
809
+ "num_tokens": 8159337.0,
810
+ "step": 1600
811
+ },
812
+ {
813
+ "entropy": 0.04597685588523746,
814
+ "epoch": 1.08,
815
+ "grad_norm": 0.09679801762104034,
816
+ "learning_rate": 0.00015195975997922892,
817
+ "loss": 0.04302051663398743,
818
+ "mean_token_accuracy": 0.9793232962489128,
819
+ "num_tokens": 8262074.0,
820
+ "step": 1620
821
+ },
822
+ {
823
+ "entropy": 0.04526777658611536,
824
+ "epoch": 1.0933333333333333,
825
+ "grad_norm": 0.10501035302877426,
826
+ "learning_rate": 0.00015069842167021635,
827
+ "loss": 0.043459060788154605,
828
+ "mean_token_accuracy": 0.9790220081806182,
829
+ "num_tokens": 8363286.0,
830
+ "step": 1640
831
+ },
832
+ {
833
+ "entropy": 0.04562727101147175,
834
+ "epoch": 1.1066666666666667,
835
+ "grad_norm": 0.07695911824703217,
836
+ "learning_rate": 0.00014942613184828335,
837
+ "loss": 0.04361176192760467,
838
+ "mean_token_accuracy": 0.978962479531765,
839
+ "num_tokens": 8464992.0,
840
+ "step": 1660
841
+ },
842
+ {
843
+ "entropy": 0.04388966728001833,
844
+ "epoch": 1.12,
845
+ "grad_norm": 0.10466761142015457,
846
+ "learning_rate": 0.00014814316534443982,
847
+ "loss": 0.04218283891677856,
848
+ "mean_token_accuracy": 0.9791669443249702,
849
+ "num_tokens": 8567083.0,
850
+ "step": 1680
851
+ },
852
+ {
853
+ "entropy": 0.04554249225184322,
854
+ "epoch": 1.1333333333333333,
855
+ "grad_norm": 0.07236190885305405,
856
+ "learning_rate": 0.0001468497992959965,
857
+ "loss": 0.043398627638816835,
858
+ "mean_token_accuracy": 0.9791699111461639,
859
+ "num_tokens": 8669135.0,
860
+ "step": 1700
861
+ },
862
+ {
863
+ "entropy": 0.043595219124108554,
864
+ "epoch": 1.1466666666666667,
865
+ "grad_norm": 0.06271807104349136,
866
+ "learning_rate": 0.00014554631308669994,
867
+ "loss": 0.042030200362205505,
868
+ "mean_token_accuracy": 0.979636350274086,
869
+ "num_tokens": 8771085.0,
870
+ "step": 1720
871
+ },
872
+ {
873
+ "entropy": 0.04456626381725073,
874
+ "epoch": 1.16,
875
+ "grad_norm": 0.11451169848442078,
876
+ "learning_rate": 0.00014423298828638195,
877
+ "loss": 0.04222625195980072,
878
+ "mean_token_accuracy": 0.9794944658875465,
879
+ "num_tokens": 8873283.0,
880
+ "step": 1740
881
+ },
882
+ {
883
+ "entropy": 0.04446439165621996,
884
+ "epoch": 1.1733333333333333,
885
+ "grad_norm": 0.1023312583565712,
886
+ "learning_rate": 0.00014291010859013688,
887
+ "loss": 0.04255003333091736,
888
+ "mean_token_accuracy": 0.979724471271038,
889
+ "num_tokens": 8975472.0,
890
+ "step": 1760
891
+ },
892
+ {
893
+ "entropy": 0.04486837210133672,
894
+ "epoch": 1.1866666666666668,
895
+ "grad_norm": 0.10332223773002625,
896
+ "learning_rate": 0.00014157795975703986,
897
+ "loss": 0.04269057214260101,
898
+ "mean_token_accuracy": 0.9796782404184341,
899
+ "num_tokens": 9078026.0,
900
+ "step": 1780
901
+ },
902
+ {
903
+ "entropy": 0.04620604543015361,
904
+ "epoch": 1.2,
905
+ "grad_norm": 0.06070537120103836,
906
+ "learning_rate": 0.00014023682954841907,
907
+ "loss": 0.044662383198738095,
908
+ "mean_token_accuracy": 0.9784179985523224,
909
+ "num_tokens": 9180444.0,
910
+ "step": 1800
911
+ },
912
+ {
913
+ "entropy": 0.04559714160859585,
914
+ "epoch": 1.2133333333333334,
915
+ "grad_norm": 0.18560439348220825,
916
+ "learning_rate": 0.00013888700766569566,
917
+ "loss": 0.04349713623523712,
918
+ "mean_token_accuracy": 0.9794085487723351,
919
+ "num_tokens": 9282562.0,
920
+ "step": 1820
921
+ },
922
+ {
923
+ "entropy": 0.0467754821293056,
924
+ "epoch": 1.2266666666666666,
925
+ "grad_norm": 0.08615751564502716,
926
+ "learning_rate": 0.00013752878568780446,
927
+ "loss": 0.04393337666988373,
928
+ "mean_token_accuracy": 0.97873145788908,
929
+ "num_tokens": 9384267.0,
930
+ "step": 1840
931
+ },
932
+ {
933
+ "entropy": 0.04674078449606896,
934
+ "epoch": 1.24,
935
+ "grad_norm": 0.1094692274928093,
936
+ "learning_rate": 0.00013616245700820922,
937
+ "loss": 0.04425840079784393,
938
+ "mean_token_accuracy": 0.9783810645341873,
939
+ "num_tokens": 9486293.0,
940
+ "step": 1860
941
+ },
942
+ {
943
+ "entropy": 0.04517263481393456,
944
+ "epoch": 1.2533333333333334,
945
+ "grad_norm": 0.0624544620513916,
946
+ "learning_rate": 0.0001347883167715258,
947
+ "loss": 0.04288272559642792,
948
+ "mean_token_accuracy": 0.9790759727358818,
949
+ "num_tokens": 9587687.0,
950
+ "step": 1880
951
+ },
952
+ {
953
+ "entropy": 0.045213503576815126,
954
+ "epoch": 1.2666666666666666,
955
+ "grad_norm": 0.1179802417755127,
956
+ "learning_rate": 0.00013340666180976712,
957
+ "loss": 0.04305934309959412,
958
+ "mean_token_accuracy": 0.9792578309774399,
959
+ "num_tokens": 9689568.0,
960
+ "step": 1900
961
+ },
962
+ {
963
+ "entropy": 0.04414475904777646,
964
+ "epoch": 1.28,
965
+ "grad_norm": 0.10094133019447327,
966
+ "learning_rate": 0.0001320177905782236,
967
+ "loss": 0.04242780804634094,
968
+ "mean_token_accuracy": 0.9795284524559975,
969
+ "num_tokens": 9791805.0,
970
+ "step": 1920
971
+ },
972
+ {
973
+ "entropy": 0.04556956263259053,
974
+ "epoch": 1.2933333333333334,
975
+ "grad_norm": 0.07614333927631378,
976
+ "learning_rate": 0.0001306220030909931,
977
+ "loss": 0.043446135520935056,
978
+ "mean_token_accuracy": 0.9790474250912666,
979
+ "num_tokens": 9893871.0,
980
+ "step": 1940
981
+ },
982
+ {
983
+ "entropy": 0.04372665649279952,
984
+ "epoch": 1.3066666666666666,
985
+ "grad_norm": 0.09622333198785782,
986
+ "learning_rate": 0.00012921960085617373,
987
+ "loss": 0.04184481799602509,
988
+ "mean_token_accuracy": 0.979928120970726,
989
+ "num_tokens": 9995743.0,
990
+ "step": 1960
991
+ },
992
+ {
993
+ "entropy": 0.04449463188648224,
994
+ "epoch": 1.32,
995
+ "grad_norm": 0.08018497377634048,
996
+ "learning_rate": 0.0001278108868107346,
997
+ "loss": 0.043444639444351195,
998
+ "mean_token_accuracy": 0.979103796184063,
999
+ "num_tokens": 10097341.0,
1000
+ "step": 1980
1001
+ },
1002
+ {
1003
+ "entropy": 0.04594048615545034,
1004
+ "epoch": 1.3333333333333333,
1005
+ "grad_norm": 0.08098988234996796,
1006
+ "learning_rate": 0.00012639616525507717,
1007
+ "loss": 0.04326811134815216,
1008
+ "mean_token_accuracy": 0.9793805435299874,
1009
+ "num_tokens": 10199817.0,
1010
+ "step": 2000
1011
+ },
1012
+ {
1013
+ "entropy": 0.044195070117712024,
1014
+ "epoch": 1.3466666666666667,
1015
+ "grad_norm": 0.07928124070167542,
1016
+ "learning_rate": 0.00012497574178730266,
1017
+ "loss": 0.04292008876800537,
1018
+ "mean_token_accuracy": 0.979155270755291,
1019
+ "num_tokens": 10301704.0,
1020
+ "step": 2020
1021
+ },
1022
+ {
1023
+ "entropy": 0.04565720958635211,
1024
+ "epoch": 1.3599999999999999,
1025
+ "grad_norm": 0.07645630836486816,
1026
+ "learning_rate": 0.00012354992323719877,
1027
+ "loss": 0.04377688765525818,
1028
+ "mean_token_accuracy": 0.9790802374482155,
1029
+ "num_tokens": 10404032.0,
1030
+ "step": 2040
1031
+ },
1032
+ {
1033
+ "entropy": 0.044813665375113484,
1034
+ "epoch": 1.3733333333333333,
1035
+ "grad_norm": 0.0589720793068409,
1036
+ "learning_rate": 0.0001221190175999606,
1037
+ "loss": 0.04262206256389618,
1038
+ "mean_token_accuracy": 0.9795415893197059,
1039
+ "num_tokens": 10505610.0,
1040
+ "step": 2060
1041
+ },
1042
+ {
1043
+ "entropy": 0.04555217456072569,
1044
+ "epoch": 1.3866666666666667,
1045
+ "grad_norm": 0.11566988378763199,
1046
+ "learning_rate": 0.00012068333396965968,
1047
+ "loss": 0.04380977749824524,
1048
+ "mean_token_accuracy": 0.9788099125027656,
1049
+ "num_tokens": 10606782.0,
1050
+ "step": 2080
1051
+ },
1052
+ {
1053
+ "entropy": 0.04532764628529549,
1054
+ "epoch": 1.4,
1055
+ "grad_norm": 0.086255744099617,
1056
+ "learning_rate": 0.00011924318247247568,
1057
+ "loss": 0.04329647421836853,
1058
+ "mean_token_accuracy": 0.9791126802563668,
1059
+ "num_tokens": 10708263.0,
1060
+ "step": 2100
1061
+ },
1062
+ {
1063
+ "entropy": 0.04514106567949057,
1064
+ "epoch": 1.4133333333333333,
1065
+ "grad_norm": 0.06086282059550285,
1066
+ "learning_rate": 0.00011779887419970512,
1067
+ "loss": 0.04245937764644623,
1068
+ "mean_token_accuracy": 0.9797914355993271,
1069
+ "num_tokens": 10810300.0,
1070
+ "step": 2120
1071
+ },
1072
+ {
1073
+ "entropy": 0.04454901767894626,
1074
+ "epoch": 1.4266666666666667,
1075
+ "grad_norm": 0.07433643192052841,
1076
+ "learning_rate": 0.00011635072114056162,
1077
+ "loss": 0.043132221698760985,
1078
+ "mean_token_accuracy": 0.9791502475738525,
1079
+ "num_tokens": 10912165.0,
1080
+ "step": 2140
1081
+ },
1082
+ {
1083
+ "entropy": 0.04529751744121313,
1084
+ "epoch": 1.44,
1085
+ "grad_norm": 0.13444772362709045,
1086
+ "learning_rate": 0.00011489903611478229,
1087
+ "loss": 0.043829315900802614,
1088
+ "mean_token_accuracy": 0.9784928604960441,
1089
+ "num_tokens": 11014107.0,
1090
+ "step": 2160
1091
+ },
1092
+ {
1093
+ "entropy": 0.045276003703474996,
1094
+ "epoch": 1.4533333333333334,
1095
+ "grad_norm": 0.06211255118250847,
1096
+ "learning_rate": 0.00011344413270505457,
1097
+ "loss": 0.04307844340801239,
1098
+ "mean_token_accuracy": 0.9793669879436493,
1099
+ "num_tokens": 11116149.0,
1100
+ "step": 2180
1101
+ },
1102
+ {
1103
+ "entropy": 0.04517210628837347,
1104
+ "epoch": 1.4666666666666668,
1105
+ "grad_norm": 0.07761016488075256,
1106
+ "learning_rate": 0.00011198632518927832,
1107
+ "loss": 0.04319383502006531,
1108
+ "mean_token_accuracy": 0.9791072577238082,
1109
+ "num_tokens": 11217550.0,
1110
+ "step": 2200
1111
+ },
1112
+ {
1113
+ "entropy": 0.043730517756193875,
1114
+ "epoch": 1.48,
1115
+ "grad_norm": 0.08502429723739624,
1116
+ "learning_rate": 0.00011052592847267781,
1117
+ "loss": 0.0423270434141159,
1118
+ "mean_token_accuracy": 0.9796715095639229,
1119
+ "num_tokens": 11319372.0,
1120
+ "step": 2220
1121
+ },
1122
+ {
1123
+ "entropy": 0.04452117690816522,
1124
+ "epoch": 1.4933333333333334,
1125
+ "grad_norm": 0.06671646982431412,
1126
+ "learning_rate": 0.00010906325801977804,
1127
+ "loss": 0.04296606779098511,
1128
+ "mean_token_accuracy": 0.9795390352606773,
1129
+ "num_tokens": 11421402.0,
1130
+ "step": 2240
1131
+ },
1132
+ {
1133
+ "entropy": 0.04468898214399815,
1134
+ "epoch": 1.5066666666666668,
1135
+ "grad_norm": 0.08121279627084732,
1136
+ "learning_rate": 0.00010759862978626031,
1137
+ "loss": 0.04153239727020264,
1138
+ "mean_token_accuracy": 0.9799500927329063,
1139
+ "num_tokens": 11523747.0,
1140
+ "step": 2260
1141
+ },
1142
+ {
1143
+ "entropy": 0.04545955043286085,
1144
+ "epoch": 1.52,
1145
+ "grad_norm": 0.05693936347961426,
1146
+ "learning_rate": 0.00010613236015071195,
1147
+ "loss": 0.04396485388278961,
1148
+ "mean_token_accuracy": 0.9788213685154915,
1149
+ "num_tokens": 11625877.0,
1150
+ "step": 2280
1151
+ },
1152
+ {
1153
+ "entropy": 0.046351166628301146,
1154
+ "epoch": 1.5333333333333332,
1155
+ "grad_norm": 0.09166613221168518,
1156
+ "learning_rate": 0.00010466476584628413,
1157
+ "loss": 0.043498843908309937,
1158
+ "mean_token_accuracy": 0.9791526988148689,
1159
+ "num_tokens": 11727555.0,
1160
+ "step": 2300
1161
+ },
1162
+ {
1163
+ "entropy": 0.045797071792185305,
1164
+ "epoch": 1.5466666666666666,
1165
+ "grad_norm": 0.0821656882762909,
1166
+ "learning_rate": 0.00010319616389227369,
1167
+ "loss": 0.043224507570266725,
1168
+ "mean_token_accuracy": 0.9792197465896606,
1169
+ "num_tokens": 11829191.0,
1170
+ "step": 2320
1171
+ },
1172
+ {
1173
+ "entropy": 0.0452940653078258,
1174
+ "epoch": 1.56,
1175
+ "grad_norm": 0.07786799967288971,
1176
+ "learning_rate": 0.00010172687152564273,
1177
+ "loss": 0.04384516477584839,
1178
+ "mean_token_accuracy": 0.9784497052431107,
1179
+ "num_tokens": 11931301.0,
1180
+ "step": 2340
1181
+ },
1182
+ {
1183
+ "entropy": 0.04483237583190203,
1184
+ "epoch": 1.5733333333333333,
1185
+ "grad_norm": 0.08482241630554199,
1186
+ "learning_rate": 0.00010025720613249136,
1187
+ "loss": 0.04273432493209839,
1188
+ "mean_token_accuracy": 0.9794994488358497,
1189
+ "num_tokens": 12033500.0,
1190
+ "step": 2360
1191
+ },
1192
+ {
1193
+ "entropy": 0.045613698475062844,
1194
+ "epoch": 1.5866666666666667,
1195
+ "grad_norm": 0.0863715335726738,
1196
+ "learning_rate": 9.878748517949829e-05,
1197
+ "loss": 0.04371984004974365,
1198
+ "mean_token_accuracy": 0.9791261553764343,
1199
+ "num_tokens": 12135440.0,
1200
+ "step": 2380
1201
+ },
1202
+ {
1203
+ "entropy": 0.04589881300926209,
1204
+ "epoch": 1.6,
1205
+ "grad_norm": 0.062190357595682144,
1206
+ "learning_rate": 9.731802614534383e-05,
1207
+ "loss": 0.04390855133533478,
1208
+ "mean_token_accuracy": 0.9788092419505119,
1209
+ "num_tokens": 12237789.0,
1210
+ "step": 2400
1211
+ },
1212
+ {
1213
+ "entropy": 0.04429604625329375,
1214
+ "epoch": 1.6133333333333333,
1215
+ "grad_norm": 0.06404758989810944,
1216
+ "learning_rate": 9.584914645213045e-05,
1217
+ "loss": 0.042604264616966245,
1218
+ "mean_token_accuracy": 0.9796271160244941,
1219
+ "num_tokens": 12339966.0,
1220
+ "step": 2420
1221
+ },
1222
+ {
1223
+ "entropy": 0.04499910678714514,
1224
+ "epoch": 1.6266666666666667,
1225
+ "grad_norm": 0.06570903211832047,
1226
+ "learning_rate": 9.438116339681545e-05,
1227
+ "loss": 0.04222431182861328,
1228
+ "mean_token_accuracy": 0.9794401109218598,
1229
+ "num_tokens": 12441867.0,
1230
+ "step": 2440
1231
+ },
1232
+ {
1233
+ "entropy": 0.04458219092339277,
1234
+ "epoch": 1.6400000000000001,
1235
+ "grad_norm": 0.06039030849933624,
1236
+ "learning_rate": 9.291439408267093e-05,
1237
+ "loss": 0.04276288151741028,
1238
+ "mean_token_accuracy": 0.9794755399227142,
1239
+ "num_tokens": 12544334.0,
1240
+ "step": 2460
1241
+ },
1242
+ {
1243
+ "entropy": 0.04523820038884878,
1244
+ "epoch": 1.6533333333333333,
1245
+ "grad_norm": 0.09730029851198196,
1246
+ "learning_rate": 9.144915535078509e-05,
1247
+ "loss": 0.043028077483177184,
1248
+ "mean_token_accuracy": 0.9791945442557335,
1249
+ "num_tokens": 12646733.0,
1250
+ "step": 2480
1251
+ },
1252
+ {
1253
+ "entropy": 0.04477119510993362,
1254
+ "epoch": 1.6666666666666665,
1255
+ "grad_norm": 0.0753539651632309,
1256
+ "learning_rate": 8.998576371162073e-05,
1257
+ "loss": 0.04317043125629425,
1258
+ "mean_token_accuracy": 0.9792640700936317,
1259
+ "num_tokens": 12748659.0,
1260
+ "step": 2500
1261
+ },
1262
+ {
1263
+ "entropy": 0.044788467884063723,
1264
+ "epoch": 1.6800000000000002,
1265
+ "grad_norm": 0.07562968134880066,
1266
+ "learning_rate": 8.852453527664466e-05,
1267
+ "loss": 0.04256285130977631,
1268
+ "mean_token_accuracy": 0.979301193356514,
1269
+ "num_tokens": 12850375.0,
1270
+ "step": 2520
1271
+ },
1272
+ {
1273
+ "entropy": 0.045563530456274745,
1274
+ "epoch": 1.6933333333333334,
1275
+ "grad_norm": 0.08481646329164505,
1276
+ "learning_rate": 8.706578569004392e-05,
1277
+ "loss": 0.043007442355155946,
1278
+ "mean_token_accuracy": 0.9794534996151925,
1279
+ "num_tokens": 12952926.0,
1280
+ "step": 2540
1281
+ },
1282
+ {
1283
+ "entropy": 0.04439763380214572,
1284
+ "epoch": 1.7066666666666666,
1285
+ "grad_norm": 0.07377834618091583,
1286
+ "learning_rate": 8.560983006054208e-05,
1287
+ "loss": 0.04233894348144531,
1288
+ "mean_token_accuracy": 0.9793659463524819,
1289
+ "num_tokens": 13055094.0,
1290
+ "step": 2560
1291
+ },
1292
+ {
1293
+ "entropy": 0.04448066912591457,
1294
+ "epoch": 1.72,
1295
+ "grad_norm": 0.06845632195472717,
1296
+ "learning_rate": 8.415698289333213e-05,
1297
+ "loss": 0.04230453968048096,
1298
+ "mean_token_accuracy": 0.9793373107910156,
1299
+ "num_tokens": 13157565.0,
1300
+ "step": 2580
1301
+ },
1302
+ {
1303
+ "entropy": 0.04516846965998411,
1304
+ "epoch": 1.7333333333333334,
1305
+ "grad_norm": 0.0826217532157898,
1306
+ "learning_rate": 8.270755802213896e-05,
1307
+ "loss": 0.043338698148727414,
1308
+ "mean_token_accuracy": 0.9791581705212593,
1309
+ "num_tokens": 13259373.0,
1310
+ "step": 2600
1311
+ },
1312
+ {
1313
+ "entropy": 0.045483655855059625,
1314
+ "epoch": 1.7466666666666666,
1315
+ "grad_norm": 0.09278784692287445,
1316
+ "learning_rate": 8.126186854142752e-05,
1317
+ "loss": 0.043374094367027285,
1318
+ "mean_token_accuracy": 0.9789844870567321,
1319
+ "num_tokens": 13361653.0,
1320
+ "step": 2620
1321
+ },
1322
+ {
1323
+ "entropy": 0.044713820703327654,
1324
+ "epoch": 1.76,
1325
+ "grad_norm": 0.06657784432172775,
1326
+ "learning_rate": 7.982022673877022e-05,
1327
+ "loss": 0.04237607717514038,
1328
+ "mean_token_accuracy": 0.9793095976114273,
1329
+ "num_tokens": 13463283.0,
1330
+ "step": 2640
1331
+ },
1332
+ {
1333
+ "entropy": 0.044877147488296035,
1334
+ "epoch": 1.7733333333333334,
1335
+ "grad_norm": 0.08266546577215195,
1336
+ "learning_rate": 7.838294402738875e-05,
1337
+ "loss": 0.04311709105968475,
1338
+ "mean_token_accuracy": 0.9791682615876198,
1339
+ "num_tokens": 13565428.0,
1340
+ "step": 2660
1341
+ },
1342
+ {
1343
+ "entropy": 0.04468537019565701,
1344
+ "epoch": 1.7866666666666666,
1345
+ "grad_norm": 0.07597433030605316,
1346
+ "learning_rate": 7.695033087888489e-05,
1347
+ "loss": 0.0424690306186676,
1348
+ "mean_token_accuracy": 0.9796170979738236,
1349
+ "num_tokens": 13667448.0,
1350
+ "step": 2680
1351
+ },
1352
+ {
1353
+ "entropy": 0.04455111119896173,
1354
+ "epoch": 1.8,
1355
+ "grad_norm": 0.06538581848144531,
1356
+ "learning_rate": 7.55226967561746e-05,
1357
+ "loss": 0.04193790853023529,
1358
+ "mean_token_accuracy": 0.9794035986065864,
1359
+ "num_tokens": 13769362.0,
1360
+ "step": 2700
1361
+ },
1362
+ {
1363
+ "entropy": 0.043454491440206765,
1364
+ "epoch": 1.8133333333333335,
1365
+ "grad_norm": 0.05730016157031059,
1366
+ "learning_rate": 7.410035004664011e-05,
1367
+ "loss": 0.04141553640365601,
1368
+ "mean_token_accuracy": 0.9800622522830963,
1369
+ "num_tokens": 13871782.0,
1370
+ "step": 2720
1371
+ },
1372
+ {
1373
+ "entropy": 0.044676115922629836,
1374
+ "epoch": 1.8266666666666667,
1375
+ "grad_norm": 0.04646085202693939,
1376
+ "learning_rate": 7.268359799551416e-05,
1377
+ "loss": 0.04284192621707916,
1378
+ "mean_token_accuracy": 0.9793128624558449,
1379
+ "num_tokens": 13973630.0,
1380
+ "step": 2740
1381
+ },
1382
+ {
1383
+ "entropy": 0.04494037302210927,
1384
+ "epoch": 1.8399999999999999,
1385
+ "grad_norm": 0.09230729192495346,
1386
+ "learning_rate": 7.12727466395112e-05,
1387
+ "loss": 0.043046200275421144,
1388
+ "mean_token_accuracy": 0.9793307974934577,
1389
+ "num_tokens": 14075906.0,
1390
+ "step": 2760
1391
+ },
1392
+ {
1393
+ "entropy": 0.045368336327373984,
1394
+ "epoch": 1.8533333333333335,
1395
+ "grad_norm": 0.04331463947892189,
1396
+ "learning_rate": 6.986810074071932e-05,
1397
+ "loss": 0.042864075303077696,
1398
+ "mean_token_accuracy": 0.978898110985756,
1399
+ "num_tokens": 14177856.0,
1400
+ "step": 2780
1401
+ },
1402
+ {
1403
+ "entropy": 0.04510376630350947,
1404
+ "epoch": 1.8666666666666667,
1405
+ "grad_norm": 0.09033851325511932,
1406
+ "learning_rate": 6.846996372076786e-05,
1407
+ "loss": 0.04259768426418305,
1408
+ "mean_token_accuracy": 0.9792723521590233,
1409
+ "num_tokens": 14280019.0,
1410
+ "step": 2800
1411
+ },
1412
+ {
1413
+ "entropy": 0.04520597280934453,
1414
+ "epoch": 1.88,
1415
+ "grad_norm": 0.04347246140241623,
1416
+ "learning_rate": 6.707863759528446e-05,
1417
+ "loss": 0.043121880292892455,
1418
+ "mean_token_accuracy": 0.9790245160460472,
1419
+ "num_tokens": 14382127.0,
1420
+ "step": 2820
1421
+ },
1422
+ {
1423
+ "entropy": 0.045137868728488684,
1424
+ "epoch": 1.8933333333333333,
1425
+ "grad_norm": 0.08444561064243317,
1426
+ "learning_rate": 6.569442290865564e-05,
1427
+ "loss": 0.042786693572998045,
1428
+ "mean_token_accuracy": 0.9794920086860657,
1429
+ "num_tokens": 14484156.0,
1430
+ "step": 2840
1431
+ },
1432
+ {
1433
+ "entropy": 0.0450214795768261,
1434
+ "epoch": 1.9066666666666667,
1435
+ "grad_norm": 0.06270349770784378,
1436
+ "learning_rate": 6.431761866910549e-05,
1437
+ "loss": 0.04266757369041443,
1438
+ "mean_token_accuracy": 0.9790657863020897,
1439
+ "num_tokens": 14586261.0,
1440
+ "step": 2860
1441
+ },
1442
+ {
1443
+ "entropy": 0.04571379153057933,
1444
+ "epoch": 1.92,
1445
+ "grad_norm": 0.059830646961927414,
1446
+ "learning_rate": 6.294852228410585e-05,
1447
+ "loss": 0.043165019154548644,
1448
+ "mean_token_accuracy": 0.9789528846740723,
1449
+ "num_tokens": 14688252.0,
1450
+ "step": 2880
1451
+ },
1452
+ {
1453
+ "entropy": 0.04564494509249926,
1454
+ "epoch": 1.9333333333333333,
1455
+ "grad_norm": 0.2881755828857422,
1456
+ "learning_rate": 6.158742949613263e-05,
1457
+ "loss": 0.042789730429649356,
1458
+ "mean_token_accuracy": 0.9789565414190292,
1459
+ "num_tokens": 14790706.0,
1460
+ "step": 2900
1461
+ },
1462
+ {
1463
+ "entropy": 0.04481498738750815,
1464
+ "epoch": 1.9466666666666668,
1465
+ "grad_norm": 0.0739307701587677,
1466
+ "learning_rate": 6.023463431878159e-05,
1467
+ "loss": 0.04184747338294983,
1468
+ "mean_token_accuracy": 0.9795544907450676,
1469
+ "num_tokens": 14892667.0,
1470
+ "step": 2920
1471
+ },
1472
+ {
1473
+ "entropy": 0.045400716736912726,
1474
+ "epoch": 1.96,
1475
+ "grad_norm": 0.0694345086812973,
1476
+ "learning_rate": 5.889042897325755e-05,
1477
+ "loss": 0.04274559020996094,
1478
+ "mean_token_accuracy": 0.9791734784841537,
1479
+ "num_tokens": 14994588.0,
1480
+ "step": 2940
1481
+ },
1482
+ {
1483
+ "entropy": 0.045871376898139714,
1484
+ "epoch": 1.9733333333333334,
1485
+ "grad_norm": 0.06866899877786636,
1486
+ "learning_rate": 5.7555103825250914e-05,
1487
+ "loss": 0.043129801750183105,
1488
+ "mean_token_accuracy": 0.979410058259964,
1489
+ "num_tokens": 15096814.0,
1490
+ "step": 2960
1491
+ },
1492
+ {
1493
+ "entropy": 0.04594316426664591,
1494
+ "epoch": 1.9866666666666668,
1495
+ "grad_norm": 0.07196313887834549,
1496
+ "learning_rate": 5.622894732221482e-05,
1497
+ "loss": 0.04333162605762482,
1498
+ "mean_token_accuracy": 0.9789909616112709,
1499
+ "num_tokens": 15198781.0,
1500
+ "step": 2980
1501
+ },
1502
+ {
1503
+ "entropy": 0.046280243806540965,
1504
+ "epoch": 2.0,
1505
+ "grad_norm": 0.07306694984436035,
1506
+ "learning_rate": 5.491224593105695e-05,
1507
+ "loss": 0.04286535978317261,
1508
+ "mean_token_accuracy": 0.9792644336819649,
1509
+ "num_tokens": 15300370.0,
1510
+ "step": 3000
1511
+ },
1512
+ {
1513
+ "entropy": 0.044749976880848405,
1514
+ "epoch": 2.013333333333333,
1515
+ "grad_norm": 0.06247550994157791,
1516
+ "learning_rate": 5.360528407625873e-05,
1517
+ "loss": 0.04155576527118683,
1518
+ "mean_token_accuracy": 0.979676017165184,
1519
+ "num_tokens": 15402333.0,
1520
+ "step": 3020
1521
+ },
1522
+ {
1523
+ "entropy": 0.045135741028934716,
1524
+ "epoch": 2.026666666666667,
1525
+ "grad_norm": 0.09815753251314163,
1526
+ "learning_rate": 5.2308344078436344e-05,
1527
+ "loss": 0.042350149154663085,
1528
+ "mean_token_accuracy": 0.979559974372387,
1529
+ "num_tokens": 15504158.0,
1530
+ "step": 3040
1531
+ },
1532
+ {
1533
+ "entropy": 0.045068098604679106,
1534
+ "epoch": 2.04,
1535
+ "grad_norm": 0.09551538527011871,
1536
+ "learning_rate": 5.1021706093355414e-05,
1537
+ "loss": 0.04268674254417419,
1538
+ "mean_token_accuracy": 0.9792046830058098,
1539
+ "num_tokens": 15605979.0,
1540
+ "step": 3060
1541
+ },
1542
+ {
1543
+ "entropy": 0.0467217774130404,
1544
+ "epoch": 2.0533333333333332,
1545
+ "grad_norm": 0.0750860869884491,
1546
+ "learning_rate": 4.974564805141405e-05,
1547
+ "loss": 0.04325474202632904,
1548
+ "mean_token_accuracy": 0.9788183540105819,
1549
+ "num_tokens": 15708226.0,
1550
+ "step": 3080
1551
+ },
1552
+ {
1553
+ "entropy": 0.045709628332406285,
1554
+ "epoch": 2.066666666666667,
1555
+ "grad_norm": 0.08207862824201584,
1556
+ "learning_rate": 4.848044559760624e-05,
1557
+ "loss": 0.043493375182151794,
1558
+ "mean_token_accuracy": 0.9793010488152504,
1559
+ "num_tokens": 15810035.0,
1560
+ "step": 3100
1561
+ },
1562
+ {
1563
+ "entropy": 0.04442885173484683,
1564
+ "epoch": 2.08,
1565
+ "grad_norm": 0.06018839031457901,
1566
+ "learning_rate": 4.7226372031978735e-05,
1567
+ "loss": 0.0418207585811615,
1568
+ "mean_token_accuracy": 0.9797791764140129,
1569
+ "num_tokens": 15912192.0,
1570
+ "step": 3120
1571
+ },
1572
+ {
1573
+ "entropy": 0.046121115796267986,
1574
+ "epoch": 2.0933333333333333,
1575
+ "grad_norm": 0.06739337742328644,
1576
+ "learning_rate": 4.598369825059522e-05,
1577
+ "loss": 0.04348099529743195,
1578
+ "mean_token_accuracy": 0.9789452716708184,
1579
+ "num_tokens": 16013752.0,
1580
+ "step": 3140
1581
+ },
1582
+ {
1583
+ "entropy": 0.04560723854228854,
1584
+ "epoch": 2.1066666666666665,
1585
+ "grad_norm": 0.05784814432263374,
1586
+ "learning_rate": 4.475269268701868e-05,
1587
+ "loss": 0.04268187880516052,
1588
+ "mean_token_accuracy": 0.9791408717632294,
1589
+ "num_tokens": 16115637.0,
1590
+ "step": 3160
1591
+ },
1592
+ {
1593
+ "entropy": 0.045645091123878954,
1594
+ "epoch": 2.12,
1595
+ "grad_norm": 0.05607442185282707,
1596
+ "learning_rate": 4.353362125432674e-05,
1597
+ "loss": 0.042373275756835936,
1598
+ "mean_token_accuracy": 0.979694114625454,
1599
+ "num_tokens": 16217990.0,
1600
+ "step": 3180
1601
+ },
1602
+ {
1603
+ "entropy": 0.04457983383908868,
1604
+ "epoch": 2.1333333333333333,
1605
+ "grad_norm": 0.09050878137350082,
1606
+ "learning_rate": 4.232674728767082e-05,
1607
+ "loss": 0.042291298508644104,
1608
+ "mean_token_accuracy": 0.9795105144381523,
1609
+ "num_tokens": 16319781.0,
1610
+ "step": 3200
1611
+ },
1612
+ {
1613
+ "entropy": 0.04519128203392029,
1614
+ "epoch": 2.1466666666666665,
1615
+ "grad_norm": 0.06114558130502701,
1616
+ "learning_rate": 4.113233148739224e-05,
1617
+ "loss": 0.04246037602424622,
1618
+ "mean_token_accuracy": 0.9795787811279297,
1619
+ "num_tokens": 16422036.0,
1620
+ "step": 3220
1621
+ },
1622
+ {
1623
+ "entropy": 0.045624539349228145,
1624
+ "epoch": 2.16,
1625
+ "grad_norm": 0.06515778601169586,
1626
+ "learning_rate": 3.9950631862707964e-05,
1627
+ "loss": 0.04316512644290924,
1628
+ "mean_token_accuracy": 0.9788484647870064,
1629
+ "num_tokens": 16524417.0,
1630
+ "step": 3240
1631
+ },
1632
+ {
1633
+ "entropy": 0.04569779820740223,
1634
+ "epoch": 2.1733333333333333,
1635
+ "grad_norm": 0.08130136877298355,
1636
+ "learning_rate": 3.8781903675976775e-05,
1637
+ "loss": 0.04316212832927704,
1638
+ "mean_token_accuracy": 0.9789097234606743,
1639
+ "num_tokens": 16626474.0,
1640
+ "step": 3260
1641
+ },
1642
+ {
1643
+ "entropy": 0.04466199018061161,
1644
+ "epoch": 2.1866666666666665,
1645
+ "grad_norm": 0.06522400677204132,
1646
+ "learning_rate": 3.762639938755974e-05,
1647
+ "loss": 0.04167875051498413,
1648
+ "mean_token_accuracy": 0.979556742310524,
1649
+ "num_tokens": 16728484.0,
1650
+ "step": 3280
1651
+ },
1652
+ {
1653
+ "entropy": 0.044957845285534856,
1654
+ "epoch": 2.2,
1655
+ "grad_norm": 0.07835223525762558,
1656
+ "learning_rate": 3.648436860128525e-05,
1657
+ "loss": 0.041939809918403625,
1658
+ "mean_token_accuracy": 0.9797166779637336,
1659
+ "num_tokens": 16830621.0,
1660
+ "step": 3300
1661
+ },
1662
+ {
1663
+ "entropy": 0.04469237914308906,
1664
+ "epoch": 2.2133333333333334,
1665
+ "grad_norm": 0.07076659053564072,
1666
+ "learning_rate": 3.535605801053147e-05,
1667
+ "loss": 0.04294973611831665,
1668
+ "mean_token_accuracy": 0.9787584990262985,
1669
+ "num_tokens": 16932449.0,
1670
+ "step": 3320
1671
+ },
1672
+ {
1673
+ "entropy": 0.044177047722041604,
1674
+ "epoch": 2.2266666666666666,
1675
+ "grad_norm": 0.0865534245967865,
1676
+ "learning_rate": 3.424171134493756e-05,
1677
+ "loss": 0.041136741638183594,
1678
+ "mean_token_accuracy": 0.9797752141952515,
1679
+ "num_tokens": 17034746.0,
1680
+ "step": 3340
1681
+ },
1682
+ {
1683
+ "entropy": 0.044158230628818275,
1684
+ "epoch": 2.24,
1685
+ "grad_norm": 0.09348734468221664,
1686
+ "learning_rate": 3.314156931775449e-05,
1687
+ "loss": 0.04184678792953491,
1688
+ "mean_token_accuracy": 0.979484710097313,
1689
+ "num_tokens": 17137032.0,
1690
+ "step": 3360
1691
+ },
1692
+ {
1693
+ "entropy": 0.04505048170685768,
1694
+ "epoch": 2.2533333333333334,
1695
+ "grad_norm": 0.04819338768720627,
1696
+ "learning_rate": 3.205586957384838e-05,
1697
+ "loss": 0.04278863370418549,
1698
+ "mean_token_accuracy": 0.9789488822221756,
1699
+ "num_tokens": 17238981.0,
1700
+ "step": 3380
1701
+ },
1702
+ {
1703
+ "entropy": 0.044143668562173846,
1704
+ "epoch": 2.2666666666666666,
1705
+ "grad_norm": 0.08243514597415924,
1706
+ "learning_rate": 3.09848466383657e-05,
1707
+ "loss": 0.04165869653224945,
1708
+ "mean_token_accuracy": 0.9797174796462059,
1709
+ "num_tokens": 17341204.0,
1710
+ "step": 3400
1711
+ },
1712
+ {
1713
+ "entropy": 0.04463189765810967,
1714
+ "epoch": 2.2800000000000002,
1715
+ "grad_norm": 0.06700066477060318,
1716
+ "learning_rate": 2.9928731866073135e-05,
1717
+ "loss": 0.041824132204055786,
1718
+ "mean_token_accuracy": 0.9796530723571777,
1719
+ "num_tokens": 17443109.0,
1720
+ "step": 3420
1721
+ },
1722
+ {
1723
+ "entropy": 0.044507946353405714,
1724
+ "epoch": 2.2933333333333334,
1725
+ "grad_norm": 0.059370577335357666,
1726
+ "learning_rate": 2.8887753391381924e-05,
1727
+ "loss": 0.04232283234596253,
1728
+ "mean_token_accuracy": 0.9795172438025475,
1729
+ "num_tokens": 17544670.0,
1730
+ "step": 3440
1731
+ },
1732
+ {
1733
+ "entropy": 0.04427545545622706,
1734
+ "epoch": 2.3066666666666666,
1735
+ "grad_norm": 0.08195611089468002,
1736
+ "learning_rate": 2.7862136079067646e-05,
1737
+ "loss": 0.042314866185188295,
1738
+ "mean_token_accuracy": 0.9798214435577393,
1739
+ "num_tokens": 17647059.0,
1740
+ "step": 3460
1741
+ },
1742
+ {
1743
+ "entropy": 0.04503831313923001,
1744
+ "epoch": 2.32,
1745
+ "grad_norm": 0.06154360994696617,
1746
+ "learning_rate": 2.6852101475696843e-05,
1747
+ "loss": 0.04239094257354736,
1748
+ "mean_token_accuracy": 0.979605621099472,
1749
+ "num_tokens": 17749005.0,
1750
+ "step": 3480
1751
+ },
1752
+ {
1753
+ "entropy": 0.04526049355044961,
1754
+ "epoch": 2.3333333333333335,
1755
+ "grad_norm": 0.07333716750144958,
1756
+ "learning_rate": 2.585786776176985e-05,
1757
+ "loss": 0.04255903661251068,
1758
+ "mean_token_accuracy": 0.9788812786340714,
1759
+ "num_tokens": 17851383.0,
1760
+ "step": 3500
1761
+ },
1762
+ {
1763
+ "entropy": 0.04530645264312625,
1764
+ "epoch": 2.3466666666666667,
1765
+ "grad_norm": 0.06190125271677971,
1766
+ "learning_rate": 2.487964970459118e-05,
1767
+ "loss": 0.042575931549072264,
1768
+ "mean_token_accuracy": 0.9791432306170463,
1769
+ "num_tokens": 17953577.0,
1770
+ "step": 3520
1771
+ },
1772
+ {
1773
+ "entropy": 0.04435355756431818,
1774
+ "epoch": 2.36,
1775
+ "grad_norm": 0.08465747535228729,
1776
+ "learning_rate": 2.3917658611876904e-05,
1777
+ "loss": 0.04138871431350708,
1778
+ "mean_token_accuracy": 0.9799614399671555,
1779
+ "num_tokens": 18055293.0,
1780
+ "step": 3540
1781
+ },
1782
+ {
1783
+ "entropy": 0.04456534581258893,
1784
+ "epoch": 2.3733333333333335,
1785
+ "grad_norm": 0.0772717297077179,
1786
+ "learning_rate": 2.297210228610952e-05,
1787
+ "loss": 0.04198825061321258,
1788
+ "mean_token_accuracy": 0.9794510439038276,
1789
+ "num_tokens": 18157289.0,
1790
+ "step": 3560
1791
+ },
1792
+ {
1793
+ "entropy": 0.04461102448403835,
1794
+ "epoch": 2.3866666666666667,
1795
+ "grad_norm": 0.08000056445598602,
1796
+ "learning_rate": 2.2043184979649933e-05,
1797
+ "loss": 0.041901758313179015,
1798
+ "mean_token_accuracy": 0.9796808436512947,
1799
+ "num_tokens": 18258778.0,
1800
+ "step": 3580
1801
+ },
1802
+ {
1803
+ "entropy": 0.04491544393822551,
1804
+ "epoch": 2.4,
1805
+ "grad_norm": 0.0720711350440979,
1806
+ "learning_rate": 2.1131107350616187e-05,
1807
+ "loss": 0.042588868737220766,
1808
+ "mean_token_accuracy": 0.9793313190340995,
1809
+ "num_tokens": 18360839.0,
1810
+ "step": 3600
1811
+ },
1812
+ {
1813
+ "entropy": 0.045493978820741174,
1814
+ "epoch": 2.413333333333333,
1815
+ "grad_norm": 0.09875239431858063,
1816
+ "learning_rate": 2.0236066419538934e-05,
1817
+ "loss": 0.04313438236713409,
1818
+ "mean_token_accuracy": 0.9793697372078896,
1819
+ "num_tokens": 18462252.0,
1820
+ "step": 3620
1821
+ },
1822
+ {
1823
+ "entropy": 0.04539180537685752,
1824
+ "epoch": 2.4266666666666667,
1825
+ "grad_norm": 0.04752529039978981,
1826
+ "learning_rate": 1.9358255526802303e-05,
1827
+ "loss": 0.041815349459648134,
1828
+ "mean_token_accuracy": 0.9794102787971497,
1829
+ "num_tokens": 18564453.0,
1830
+ "step": 3640
1831
+ },
1832
+ {
1833
+ "entropy": 0.044612882751971485,
1834
+ "epoch": 2.44,
1835
+ "grad_norm": 0.05158265680074692,
1836
+ "learning_rate": 1.8497864290879953e-05,
1837
+ "loss": 0.04235563278198242,
1838
+ "mean_token_accuracy": 0.9792704641819,
1839
+ "num_tokens": 18666497.0,
1840
+ "step": 3660
1841
+ },
1842
+ {
1843
+ "entropy": 0.045019051525741816,
1844
+ "epoch": 2.453333333333333,
1845
+ "grad_norm": 0.0648743286728859,
1846
+ "learning_rate": 1.7655078567375028e-05,
1847
+ "loss": 0.04204939901828766,
1848
+ "mean_token_accuracy": 0.9794104173779488,
1849
+ "num_tokens": 18768455.0,
1850
+ "step": 3680
1851
+ },
1852
+ {
1853
+ "entropy": 0.04469795366749167,
1854
+ "epoch": 2.466666666666667,
1855
+ "grad_norm": 0.05884250998497009,
1856
+ "learning_rate": 1.683008040887285e-05,
1857
+ "loss": 0.04209013283252716,
1858
+ "mean_token_accuracy": 0.9796774923801422,
1859
+ "num_tokens": 18870275.0,
1860
+ "step": 3700
1861
+ },
1862
+ {
1863
+ "entropy": 0.04474199656397104,
1864
+ "epoch": 2.48,
1865
+ "grad_norm": 0.051543645560741425,
1866
+ "learning_rate": 1.6023048025615405e-05,
1867
+ "loss": 0.04179444909095764,
1868
+ "mean_token_accuracy": 0.9795808404684067,
1869
+ "num_tokens": 18972156.0,
1870
+ "step": 3720
1871
+ },
1872
+ {
1873
+ "entropy": 0.04483764311298728,
1874
+ "epoch": 2.493333333333333,
1875
+ "grad_norm": 0.10630819946527481,
1876
+ "learning_rate": 1.5234155747005486e-05,
1877
+ "loss": 0.042180657386779785,
1878
+ "mean_token_accuracy": 0.9794986173510551,
1879
+ "num_tokens": 19074197.0,
1880
+ "step": 3740
1881
+ },
1882
+ {
1883
+ "entropy": 0.04558736402541399,
1884
+ "epoch": 2.506666666666667,
1885
+ "grad_norm": 0.08093755692243576,
1886
+ "learning_rate": 1.4463573983949341e-05,
1887
+ "loss": 0.04298904240131378,
1888
+ "mean_token_accuracy": 0.9790481492877007,
1889
+ "num_tokens": 19176367.0,
1890
+ "step": 3760
1891
+ },
1892
+ {
1893
+ "entropy": 0.04453156525269151,
1894
+ "epoch": 2.52,
1895
+ "grad_norm": 0.0727071687579155,
1896
+ "learning_rate": 1.3711469192045723e-05,
1897
+ "loss": 0.041091355681419375,
1898
+ "mean_token_accuracy": 0.9804318726062775,
1899
+ "num_tokens": 19278992.0,
1900
+ "step": 3780
1901
+ },
1902
+ {
1903
+ "entropy": 0.04554087147116661,
1904
+ "epoch": 2.533333333333333,
1905
+ "grad_norm": 0.0910055935382843,
1906
+ "learning_rate": 1.297800383562926e-05,
1907
+ "loss": 0.04345537126064301,
1908
+ "mean_token_accuracy": 0.9786257922649384,
1909
+ "num_tokens": 19380593.0,
1910
+ "step": 3800
1911
+ },
1912
+ {
1913
+ "entropy": 0.04596257032826543,
1914
+ "epoch": 2.546666666666667,
1915
+ "grad_norm": 0.0877053365111351,
1916
+ "learning_rate": 1.2263336352676235e-05,
1917
+ "loss": 0.04255788326263428,
1918
+ "mean_token_accuracy": 0.9795473828911782,
1919
+ "num_tokens": 19482278.0,
1920
+ "step": 3820
1921
+ },
1922
+ {
1923
+ "entropy": 0.044655687548220156,
1924
+ "epoch": 2.56,
1925
+ "grad_norm": 0.10276857763528824,
1926
+ "learning_rate": 1.1567621120579753e-05,
1927
+ "loss": 0.0418385773897171,
1928
+ "mean_token_accuracy": 0.9795376226305962,
1929
+ "num_tokens": 19584297.0,
1930
+ "step": 3840
1931
+ },
1932
+ {
1933
+ "entropy": 0.04575161607936025,
1934
+ "epoch": 2.5733333333333333,
1935
+ "grad_norm": 0.09059888869524002,
1936
+ "learning_rate": 1.089100842280234e-05,
1937
+ "loss": 0.042618009448051455,
1938
+ "mean_token_accuracy": 0.9796013042330742,
1939
+ "num_tokens": 19686257.0,
1940
+ "step": 3860
1941
+ },
1942
+ {
1943
+ "entropy": 0.04560979856178164,
1944
+ "epoch": 2.586666666666667,
1945
+ "grad_norm": 0.048925597220659256,
1946
+ "learning_rate": 1.0233644416412791e-05,
1947
+ "loss": 0.04292104840278625,
1948
+ "mean_token_accuracy": 0.9794995337724686,
1949
+ "num_tokens": 19788450.0,
1950
+ "step": 3880
1951
+ },
1952
+ {
1953
+ "entropy": 0.0455952113494277,
1954
+ "epoch": 2.6,
1955
+ "grad_norm": 0.048526402562856674,
1956
+ "learning_rate": 9.595671100514214e-06,
1957
+ "loss": 0.042637795209884644,
1958
+ "mean_token_accuracy": 0.9797911092638969,
1959
+ "num_tokens": 19890524.0,
1960
+ "step": 3900
1961
+ },
1962
+ {
1963
+ "entropy": 0.04548884928226471,
1964
+ "epoch": 2.6133333333333333,
1965
+ "grad_norm": 0.06042620167136192,
1966
+ "learning_rate": 8.977226285570606e-06,
1967
+ "loss": 0.04222815930843353,
1968
+ "mean_token_accuracy": 0.9794741749763489,
1969
+ "num_tokens": 19992209.0,
1970
+ "step": 3920
1971
+ },
1972
+ {
1973
+ "entropy": 0.045671455282717946,
1974
+ "epoch": 2.626666666666667,
1975
+ "grad_norm": 0.07702252268791199,
1976
+ "learning_rate": 8.378443563637828e-06,
1977
+ "loss": 0.042873308062553406,
1978
+ "mean_token_accuracy": 0.9794026196002961,
1979
+ "num_tokens": 20093703.0,
1980
+ "step": 3940
1981
+ },
1982
+ {
1983
+ "entropy": 0.04522231016308069,
1984
+ "epoch": 2.64,
1985
+ "grad_norm": 0.07133087515830994,
1986
+ "learning_rate": 7.799452279506125e-06,
1987
+ "loss": 0.042153152823448184,
1988
+ "mean_token_accuracy": 0.9797803938388825,
1989
+ "num_tokens": 20195947.0,
1990
+ "step": 3960
1991
+ },
1992
+ {
1993
+ "entropy": 0.04628952695056796,
1994
+ "epoch": 2.6533333333333333,
1995
+ "grad_norm": 0.06586236506700516,
1996
+ "learning_rate": 7.240377502759932e-06,
1997
+ "loss": 0.043617674708366395,
1998
+ "mean_token_accuracy": 0.9784920737147331,
1999
+ "num_tokens": 20298043.0,
2000
+ "step": 3980
2001
+ },
2002
+ {
2003
+ "entropy": 0.045405203476548195,
2004
+ "epoch": 2.6666666666666665,
2005
+ "grad_norm": 0.06839724630117416,
2006
+ "learning_rate": 6.70134000076118e-06,
2007
+ "loss": 0.04227378368377686,
2008
+ "mean_token_accuracy": 0.979735977947712,
2009
+ "num_tokens": 20399972.0,
2010
+ "step": 4000
2011
+ },
2012
+ {
2013
+ "entropy": 0.045020535588264465,
2014
+ "epoch": 2.68,
2015
+ "grad_norm": 0.07815848290920258,
2016
+ "learning_rate": 6.182456212562093e-06,
2017
+ "loss": 0.04192916452884674,
2018
+ "mean_token_accuracy": 0.9796771243214607,
2019
+ "num_tokens": 20501675.0,
2020
+ "step": 4020
2021
+ },
2022
+ {
2023
+ "entropy": 0.04609425235539675,
2024
+ "epoch": 2.6933333333333334,
2025
+ "grad_norm": 0.05290106683969498,
2026
+ "learning_rate": 5.68383822375278e-06,
2027
+ "loss": 0.042898637056350705,
2028
+ "mean_token_accuracy": 0.9792009994387627,
2029
+ "num_tokens": 20603651.0,
2030
+ "step": 4040
2031
+ },
2032
+ {
2033
+ "entropy": 0.0457917626015842,
2034
+ "epoch": 2.7066666666666666,
2035
+ "grad_norm": 0.0704483613371849,
2036
+ "learning_rate": 5.205593742249326e-06,
2037
+ "loss": 0.0423770546913147,
2038
+ "mean_token_accuracy": 0.9790433034300804,
2039
+ "num_tokens": 20705702.0,
2040
+ "step": 4060
2041
+ },
2042
+ {
2043
+ "entropy": 0.044912660401314496,
2044
+ "epoch": 2.7199999999999998,
2045
+ "grad_norm": 0.058434613049030304,
2046
+ "learning_rate": 4.747826075027506e-06,
2047
+ "loss": 0.04174522757530212,
2048
+ "mean_token_accuracy": 0.9795982718467713,
2049
+ "num_tokens": 20807336.0,
2050
+ "step": 4080
2051
+ },
2052
+ {
2053
+ "entropy": 0.045613402500748634,
2054
+ "epoch": 2.7333333333333334,
2055
+ "grad_norm": 0.08788046985864639,
2056
+ "learning_rate": 4.310634105807065e-06,
2057
+ "loss": 0.04344511330127716,
2058
+ "mean_token_accuracy": 0.9793641656637192,
2059
+ "num_tokens": 20909744.0,
2060
+ "step": 4100
2061
+ },
2062
+ {
2063
+ "entropy": 0.04498438341543078,
2064
+ "epoch": 2.7466666666666666,
2065
+ "grad_norm": 0.06054578721523285,
2066
+ "learning_rate": 3.894112273691697e-06,
2067
+ "loss": 0.041690278053283694,
2068
+ "mean_token_accuracy": 0.9799363717436791,
2069
+ "num_tokens": 21011520.0,
2070
+ "step": 4120
2071
+ },
2072
+ {
2073
+ "entropy": 0.04519799826666713,
2074
+ "epoch": 2.76,
2075
+ "grad_norm": 0.06741084903478622,
2076
+ "learning_rate": 3.4983505527688586e-06,
2077
+ "loss": 0.042607730627059935,
2078
+ "mean_token_accuracy": 0.979535199701786,
2079
+ "num_tokens": 21113638.0,
2080
+ "step": 4140
2081
+ },
2082
+ {
2083
+ "entropy": 0.04527061656117439,
2084
+ "epoch": 2.7733333333333334,
2085
+ "grad_norm": 0.053430285304784775,
2086
+ "learning_rate": 3.1234344326742657e-06,
2087
+ "loss": 0.04179522097110748,
2088
+ "mean_token_accuracy": 0.979697409272194,
2089
+ "num_tokens": 21215783.0,
2090
+ "step": 4160
2091
+ },
2092
+ {
2093
+ "entropy": 0.045730549935251476,
2094
+ "epoch": 2.7866666666666666,
2095
+ "grad_norm": 0.07262956351041794,
2096
+ "learning_rate": 2.7694449001250512e-06,
2097
+ "loss": 0.042841532826423646,
2098
+ "mean_token_accuracy": 0.9794132426381111,
2099
+ "num_tokens": 21317798.0,
2100
+ "step": 4180
2101
+ },
2102
+ {
2103
+ "entropy": 0.04552676072344184,
2104
+ "epoch": 2.8,
2105
+ "grad_norm": 0.06751976907253265,
2106
+ "learning_rate": 2.4364584214254695e-06,
2107
+ "loss": 0.04251702129840851,
2108
+ "mean_token_accuracy": 0.9793218955397606,
2109
+ "num_tokens": 21419787.0,
2110
+ "step": 4200
2111
+ },
2112
+ {
2113
+ "entropy": 0.045480293966829774,
2114
+ "epoch": 2.8133333333333335,
2115
+ "grad_norm": 0.0856935977935791,
2116
+ "learning_rate": 2.124546925949389e-06,
2117
+ "loss": 0.04228883981704712,
2118
+ "mean_token_accuracy": 0.9794924795627594,
2119
+ "num_tokens": 21521816.0,
2120
+ "step": 4220
2121
+ },
2122
+ {
2123
+ "entropy": 0.04522721925750375,
2124
+ "epoch": 2.8266666666666667,
2125
+ "grad_norm": 0.04721014201641083,
2126
+ "learning_rate": 1.8337777906023978e-06,
2127
+ "loss": 0.04205127358436585,
2128
+ "mean_token_accuracy": 0.9795928984880448,
2129
+ "num_tokens": 21623696.0,
2130
+ "step": 4240
2131
+ },
2132
+ {
2133
+ "entropy": 0.0451619129627943,
2134
+ "epoch": 2.84,
2135
+ "grad_norm": 0.06828150898218155,
2136
+ "learning_rate": 1.5642138252677019e-06,
2137
+ "loss": 0.041848546266555785,
2138
+ "mean_token_accuracy": 0.9796140640974045,
2139
+ "num_tokens": 21726066.0,
2140
+ "step": 4260
2141
+ },
2142
+ {
2143
+ "entropy": 0.04501318633556366,
2144
+ "epoch": 2.8533333333333335,
2145
+ "grad_norm": 0.08222071826457977,
2146
+ "learning_rate": 1.3159132592382772e-06,
2147
+ "loss": 0.04213366806507111,
2148
+ "mean_token_accuracy": 0.9795982599258423,
2149
+ "num_tokens": 21828178.0,
2150
+ "step": 4280
2151
+ },
2152
+ {
2153
+ "entropy": 0.0461537716910243,
2154
+ "epoch": 2.8666666666666667,
2155
+ "grad_norm": 0.0802520290017128,
2156
+ "learning_rate": 1.0889297286386102e-06,
2157
+ "loss": 0.04323468208312988,
2158
+ "mean_token_accuracy": 0.9791506737470627,
2159
+ "num_tokens": 21929963.0,
2160
+ "step": 4300
2161
+ },
2162
+ {
2163
+ "entropy": 0.04528212863951921,
2164
+ "epoch": 2.88,
2165
+ "grad_norm": 0.08974730968475342,
2166
+ "learning_rate": 8.833122648386871e-07,
2167
+ "loss": 0.042816996574401855,
2168
+ "mean_token_accuracy": 0.9789806365966797,
2169
+ "num_tokens": 22032092.0,
2170
+ "step": 4320
2171
+ },
2172
+ {
2173
+ "entropy": 0.045245842542499304,
2174
+ "epoch": 2.8933333333333335,
2175
+ "grad_norm": 0.05283057317137718,
2176
+ "learning_rate": 6.991052838624113e-07,
2177
+ "loss": 0.04174770712852478,
2178
+ "mean_token_accuracy": 0.9798634141683579,
2179
+ "num_tokens": 22134281.0,
2180
+ "step": 4340
2181
+ },
2182
+ {
2183
+ "entropy": 0.045284852758049964,
2184
+ "epoch": 2.9066666666666667,
2185
+ "grad_norm": 0.0722041130065918,
2186
+ "learning_rate": 5.363485767933663e-07,
2187
+ "loss": 0.041790124773979184,
2188
+ "mean_token_accuracy": 0.979168464243412,
2189
+ "num_tokens": 22236085.0,
2190
+ "step": 4360
2191
+ },
2192
+ {
2193
+ "entropy": 0.04504124140366912,
2194
+ "epoch": 2.92,
2195
+ "grad_norm": 0.06595401465892792,
2196
+ "learning_rate": 3.9507730117926967e-07,
2197
+ "loss": 0.04146735072135925,
2198
+ "mean_token_accuracy": 0.9801181107759476,
2199
+ "num_tokens": 22338053.0,
2200
+ "step": 4380
2201
+ },
2202
+ {
2203
+ "entropy": 0.04522117590531707,
2204
+ "epoch": 2.9333333333333336,
2205
+ "grad_norm": 0.06364521384239197,
2206
+ "learning_rate": 2.7532197343758115e-07,
2207
+ "loss": 0.04191155731678009,
2208
+ "mean_token_accuracy": 0.9794103637337684,
2209
+ "num_tokens": 22440208.0,
2210
+ "step": 4400
2211
+ },
2212
+ {
2213
+ "entropy": 0.045472448039799926,
2214
+ "epoch": 2.9466666666666668,
2215
+ "grad_norm": 0.0597660131752491,
2216
+ "learning_rate": 1.7710846226355328e-07,
2217
+ "loss": 0.04289998710155487,
2218
+ "mean_token_accuracy": 0.9792811706662178,
2219
+ "num_tokens": 22542219.0,
2220
+ "step": 4420
2221
+ },
2222
+ {
2223
+ "entropy": 0.04583751475438476,
2224
+ "epoch": 2.96,
2225
+ "grad_norm": 0.08572968095541,
2226
+ "learning_rate": 1.0045798304220145e-07,
2227
+ "loss": 0.0427745521068573,
2228
+ "mean_token_accuracy": 0.9792221873998642,
2229
+ "num_tokens": 22644025.0,
2230
+ "step": 4440
2231
+ },
2232
+ {
2233
+ "entropy": 0.04562570815905929,
2234
+ "epoch": 2.9733333333333336,
2235
+ "grad_norm": 0.0797945037484169,
2236
+ "learning_rate": 4.5387093265591986e-08,
2237
+ "loss": 0.04286653101444245,
2238
+ "mean_token_accuracy": 0.9792360305786133,
2239
+ "num_tokens": 22745968.0,
2240
+ "step": 4460
2241
+ },
2242
+ {
2243
+ "entropy": 0.045168190728873014,
2244
+ "epoch": 2.986666666666667,
2245
+ "grad_norm": 0.07274357974529266,
2246
+ "learning_rate": 1.1907688956136477e-08,
2247
+ "loss": 0.04201154708862305,
2248
+ "mean_token_accuracy": 0.9799786448478699,
2249
+ "num_tokens": 22848205.0,
2250
+ "step": 4480
2251
+ },
2252
+ {
2253
+ "entropy": 0.045816550869494675,
2254
+ "epoch": 3.0,
2255
+ "grad_norm": 0.06689723581075668,
2256
+ "learning_rate": 2.70020969361795e-11,
2257
+ "loss": 0.042978566884994504,
2258
+ "mean_token_accuracy": 0.9794494539499283,
2259
+ "num_tokens": 22950555.0,
2260
+ "step": 4500
2261
+ }
2262
+ ],
2263
+ "logging_steps": 20,
2264
+ "max_steps": 4500,
2265
+ "num_input_tokens_seen": 0,
2266
+ "num_train_epochs": 3,
2267
+ "save_steps": 500,
2268
+ "stateful_callbacks": {
2269
+ "TrainerControl": {
2270
+ "args": {
2271
+ "should_epoch_stop": false,
2272
+ "should_evaluate": false,
2273
+ "should_log": false,
2274
+ "should_save": true,
2275
+ "should_training_stop": true
2276
+ },
2277
+ "attributes": {}
2278
+ }
2279
+ },
2280
+ "total_flos": 1.5251560037074944e+16,
2281
+ "train_batch_size": 8,
2282
+ "trial_name": null,
2283
+ "trial_params": null
2284
+ }
checkpoint-4500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d5f0b1c8ce7c4a618e3140df4d2f8e00e0eb1af31e455fd9e89b510e83ad4ae
3
+ size 5585
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:daab2354f8a74e70d70b4d1f804939b68a8c9624dd06cb7858e52dd8970e9726
3
+ size 33384567
tokenizer_config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "boi_token": "<start_of_image>",
4
+ "bos_token": "<bos>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eoi_token": "<end_of_image>",
7
+ "eos_token": "<eos>",
8
+ "image_token": "<image_soft_token>",
9
+ "is_local": false,
10
+ "local_files_only": false,
11
+ "mask_token": "<mask>",
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "model_specific_special_tokens": {
14
+ "boi_token": "<start_of_image>",
15
+ "eoi_token": "<end_of_image>",
16
+ "image_token": "<image_soft_token>"
17
+ },
18
+ "pad_token": "<pad>",
19
+ "padding_side": "left",
20
+ "sp_model_kwargs": null,
21
+ "spaces_between_special_tokens": false,
22
+ "tokenizer_class": "GemmaTokenizer",
23
+ "unk_token": "<unk>",
24
+ "use_default_system_prompt": false
25
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d5f0b1c8ce7c4a618e3140df4d2f8e00e0eb1af31e455fd9e89b510e83ad4ae
3
+ size 5585