RohithMidigudla committed on
Commit
e28bc94
·
verified ·
1 Parent(s): 9f52bec

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +56 -0
  2. README.md +63 -0
  3. adapter_config.json +52 -0
  4. adapter_model.safetensors +3 -0
  5. chat_template.jinja +351 -0
  6. checkpoint-100/README.md +210 -0
  7. checkpoint-100/adapter_config.json +52 -0
  8. checkpoint-100/adapter_model.safetensors +3 -0
  9. checkpoint-100/chat_template.jinja +351 -0
  10. checkpoint-100/optimizer.pt +3 -0
  11. checkpoint-100/processor_config.json +75 -0
  12. checkpoint-100/rng_state.pth +3 -0
  13. checkpoint-100/scheduler.pt +3 -0
  14. checkpoint-100/tokenizer.json +3 -0
  15. checkpoint-100/tokenizer_config.json +289 -0
  16. checkpoint-100/trainer_state.json +182 -0
  17. checkpoint-100/training_args.bin +3 -0
  18. checkpoint-1000/README.md +210 -0
  19. checkpoint-1000/adapter_config.json +52 -0
  20. checkpoint-1000/adapter_model.safetensors +3 -0
  21. checkpoint-1000/chat_template.jinja +351 -0
  22. checkpoint-1000/optimizer.pt +3 -0
  23. checkpoint-1000/processor_config.json +75 -0
  24. checkpoint-1000/rng_state.pth +3 -0
  25. checkpoint-1000/scheduler.pt +3 -0
  26. checkpoint-1000/tokenizer.json +3 -0
  27. checkpoint-1000/tokenizer_config.json +289 -0
  28. checkpoint-1000/trainer_state.json +1442 -0
  29. checkpoint-1000/training_args.bin +3 -0
  30. checkpoint-1100/README.md +210 -0
  31. checkpoint-1100/adapter_config.json +52 -0
  32. checkpoint-1100/adapter_model.safetensors +3 -0
  33. checkpoint-1100/chat_template.jinja +351 -0
  34. checkpoint-1100/optimizer.pt +3 -0
  35. checkpoint-1100/processor_config.json +75 -0
  36. checkpoint-1100/rng_state.pth +3 -0
  37. checkpoint-1100/scheduler.pt +3 -0
  38. checkpoint-1100/tokenizer.json +3 -0
  39. checkpoint-1100/tokenizer_config.json +289 -0
  40. checkpoint-1100/trainer_state.json +1582 -0
  41. checkpoint-1100/training_args.bin +3 -0
  42. checkpoint-1200/README.md +210 -0
  43. checkpoint-1200/adapter_config.json +52 -0
  44. checkpoint-1200/adapter_model.safetensors +3 -0
  45. checkpoint-1200/chat_template.jinja +351 -0
  46. checkpoint-1200/optimizer.pt +3 -0
  47. checkpoint-1200/processor_config.json +75 -0
  48. checkpoint-1200/rng_state.pth +3 -0
  49. checkpoint-1200/scheduler.pt +3 -0
  50. checkpoint-1200/tokenizer.json +3 -0
.gitattributes CHANGED
@@ -34,3 +34,59 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  last-checkpoint/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-1100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-1200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ checkpoint-1300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ checkpoint-1600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
+ checkpoint-1700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ checkpoint-1800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
+ checkpoint-1900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
+ checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ checkpoint-2100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ checkpoint-2200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+ checkpoint-2300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
53
+ checkpoint-2400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
54
+ checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
55
+ checkpoint-2600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
56
+ checkpoint-2700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
57
+ checkpoint-2800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
58
+ checkpoint-2900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
59
+ checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
60
+ checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
61
+ checkpoint-3100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
62
+ checkpoint-3200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
63
+ checkpoint-3300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
64
+ checkpoint-3400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
65
+ checkpoint-3500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
66
+ checkpoint-3600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
67
+ checkpoint-3700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
68
+ checkpoint-3800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
69
+ checkpoint-3900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
70
+ checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
71
+ checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
72
+ checkpoint-4100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
73
+ checkpoint-4200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
74
+ checkpoint-4300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
75
+ checkpoint-4400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
76
+ checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
77
+ checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
78
+ checkpoint-4700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
79
+ checkpoint-4800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
80
+ checkpoint-4900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
81
+ checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
82
+ checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
83
+ checkpoint-5100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
84
+ checkpoint-5200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
85
+ checkpoint-5300/tokenizer.json filter=lfs diff=lfs merge=lfs -text
86
+ checkpoint-5400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
87
+ checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
88
+ checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
89
+ checkpoint-700/tokenizer.json filter=lfs diff=lfs merge=lfs -text
90
+ checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
91
+ checkpoint-900/tokenizer.json filter=lfs diff=lfs merge=lfs -text
92
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/gemma-4-E4B-it
3
+ library_name: peft
4
+ model_name: telugu
5
+ tags:
6
+ - base_model:adapter:unsloth/gemma-4-E4B-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ licence: license
13
+ pipeline_tag: text-generation
14
+ ---
15
+
16
+ # Model Card for telugu
17
+
18
+ This model is a fine-tuned version of [unsloth/gemma-4-E4B-it](https://huggingface.co/unsloth/gemma-4-E4B-it).
19
+ It has been trained using [TRL](https://github.com/huggingface/trl).
20
+
21
+ ## Quick start
22
+
23
+ ```python
24
+ from transformers import pipeline
25
+
26
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
27
+ generator = pipeline("text-generation", model="<hub-repo-id-or-local-path-of-this-adapter>", device="cuda")
28
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
29
+ print(output["generated_text"])
30
+ ```
31
+
32
+ ## Training procedure
33
+
34
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/rohithsaimidigudla-omnisynkai/gemma-health-adapters/runs/mwc9jt0z)
35
+
36
+
37
+ This model was trained with SFT.
38
+
39
+ ### Framework versions
40
+
41
+ - PEFT: 0.19.1
42
+ - TRL: 0.19.1
43
+ - Transformers: 5.5.0
44
+ - Pytorch: 2.7.0+cu128
45
+ - Datasets: 3.6.0
46
+ - Tokenizers: 0.22.2
47
+
48
+ ## Citations
49
+
50
+
51
+
52
+ Cite TRL as:
53
+
54
+ ```bibtex
55
+ @misc{vonwerra2022trl,
56
+ title = {{TRL: Transformer Reinforcement Learning}},
57
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
58
+ year = 2020,
59
+ journal = {GitHub repository},
60
+ publisher = {GitHub},
61
+ howpublished = {\url{https://github.com/huggingface/trl}}
62
+ }
63
+ ```
adapter_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma4ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma4.modeling_gemma4",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 16,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.0,
26
+ "lora_ga_config": null,
27
+ "megatron_config": null,
28
+ "megatron_core": "megatron.core",
29
+ "modules_to_save": null,
30
+ "peft_type": "LORA",
31
+ "peft_version": "0.19.1",
32
+ "qalora_group_size": 16,
33
+ "r": 16,
34
+ "rank_pattern": {},
35
+ "revision": null,
36
+ "target_modules": [
37
+ "gate_proj",
38
+ "v_proj",
39
+ "o_proj",
40
+ "k_proj",
41
+ "up_proj",
42
+ "down_proj",
43
+ "q_proj"
44
+ ],
45
+ "target_parameters": null,
46
+ "task_type": "CAUSAL_LM",
47
+ "trainable_token_indices": null,
48
+ "use_bdlora": null,
49
+ "use_dora": false,
50
+ "use_qalora": false,
51
+ "use_rslora": false
52
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:023fcb9c596c99c5e8d74320f9720621834918ec3bcd5d877b44b0fe0907ce2e
3
+ size 169741912
chat_template.jinja ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required, filter_keys=false) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- set add_comma = false -%}
6
+ {%- if not filter_keys or key not in standard_keys -%}
7
+ {%- if ns.found_first %},{% endif -%}
8
+ {%- set ns.found_first = true -%}
9
+ {{ key }}:{
10
+ {%- if value['description'] -%}
11
+ description:<|"|>{{ value['description'] }}<|"|>
12
+ {%- set add_comma = true -%}
13
+ {%- endif -%}
14
+ {%- if value['type'] | upper == 'STRING' -%}
15
+ {%- if value['enum'] -%}
16
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
17
+ enum:{{ format_argument(value['enum']) }}
18
+ {%- endif -%}
19
+ {%- elif value['type'] | upper == 'ARRAY' -%}
20
+ {%- if value['items'] is mapping and value['items'] -%}
21
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
22
+ items:{
23
+ {%- set ns_items = namespace(found_first=false) -%}
24
+ {%- for item_key, item_value in value['items'] | dictsort -%}
25
+ {%- if item_value is not none -%}
26
+ {%- if ns_items.found_first %},{% endif -%}
27
+ {%- set ns_items.found_first = true -%}
28
+ {%- if item_key == 'properties' -%}
29
+ properties:{
30
+ {%- if item_value is mapping -%}
31
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
32
+ {%- endif -%}
33
+ }
34
+ {%- elif item_key == 'required' -%}
35
+ required:[
36
+ {%- for req_item in item_value -%}
37
+ <|"|>{{- req_item -}}<|"|>
38
+ {%- if not loop.last %},{% endif -%}
39
+ {%- endfor -%}
40
+ ]
41
+ {%- elif item_key == 'type' -%}
42
+ {%- if item_value is string -%}
43
+ type:{{ format_argument(item_value | upper) }}
44
+ {%- else -%}
45
+ type:{{ format_argument(item_value | map('upper') | list) }}
46
+ {%- endif -%}
47
+ {%- else -%}
48
+ {{ item_key }}:{{ format_argument(item_value) }}
49
+ {%- endif -%}
50
+ {%- endif -%}
51
+ {%- endfor -%}
52
+ }
53
+ {%- endif -%}
54
+ {%- endif -%}
55
+ {%- if value['nullable'] %}
56
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
57
+ nullable:true
58
+ {%- endif -%}
59
+ {%- if value['type'] | upper == 'OBJECT' -%}
60
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
61
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
62
+ properties:{
63
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
64
+ }
65
+ {%- elif value is mapping -%}
66
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
+ properties:{
68
+ {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
69
+ }
70
+ {%- endif -%}
71
+ {%- if value['required'] -%}
72
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
73
+ required:[
74
+ {%- for item in value['required'] | default([]) -%}
75
+ <|"|>{{- item -}}<|"|>
76
+ {%- if not loop.last %},{% endif -%}
77
+ {%- endfor -%}
78
+ ]
79
+ {%- endif -%}
80
+ {%- endif -%}
81
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
82
+ type:<|"|>{{ value['type'] | upper }}<|"|>}
83
+ {%- endif -%}
84
+ {%- endfor -%}
85
+ {%- endmacro -%}
86
+ {%- macro format_function_declaration(tool_data) -%}
87
+ declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
88
+ {%- set params = tool_data['function']['parameters'] -%}
89
+ {%- if params -%}
90
+ ,parameters:{
91
+ {%- if params['properties'] -%}
92
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
93
+ {%- endif -%}
94
+ {%- if params['required'] -%}
95
+ required:[
96
+ {%- for item in params['required'] -%}
97
+ <|"|>{{- item -}}<|"|>
98
+ {{- ',' if not loop.last -}}
99
+ {%- endfor -%}
100
+ ],
101
+ {%- endif -%}
102
+ {%- if params['type'] -%}
103
+ type:<|"|>{{- params['type'] | upper -}}<|"|>}
104
+ {%- endif -%}
105
+ {%- endif -%}
106
+ {%- if 'response' in tool_data['function'] -%}
107
+ {%- set response_declaration = tool_data['function']['response'] -%}
108
+ ,response:{
109
+ {%- if response_declaration['description'] -%}
110
+ description:<|"|>{{- response_declaration['description'] -}}<|"|>,
111
+ {%- endif -%}
112
+ {%- if response_declaration['type'] | upper == 'OBJECT' -%}
113
+ type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
114
+ {%- endif -%}
115
+ {%- endif -%}
116
+ }
117
+ {%- endmacro -%}
118
+ {%- macro format_argument(argument, escape_keys=True) -%}
119
+ {%- if argument is string -%}
120
+ {{- '<|"|>' + argument + '<|"|>' -}}
121
+ {%- elif argument is boolean -%}
122
+ {{- 'true' if argument else 'false' -}}
123
+ {%- elif argument is mapping -%}
124
+ {{- '{' -}}
125
+ {%- set ns = namespace(found_first=false) -%}
126
+ {%- for key, value in argument | dictsort -%}
127
+ {%- if ns.found_first %},{% endif -%}
128
+ {%- set ns.found_first = true -%}
129
+ {%- if escape_keys -%}
130
+ {{- '<|"|>' + key + '<|"|>' -}}
131
+ {%- else -%}
132
+ {{- key -}}
133
+ {%- endif -%}
134
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
135
+ {%- endfor -%}
136
+ {{- '}' -}}
137
+ {%- elif argument is sequence -%}
138
+ {{- '[' -}}
139
+ {%- for item in argument -%}
140
+ {{- format_argument(item, escape_keys=escape_keys) -}}
141
+ {%- if not loop.last %},{% endif -%}
142
+ {%- endfor -%}
143
+ {{- ']' -}}
144
+ {%- else -%}
145
+ {{- argument -}}
146
+ {%- endif -%}
147
+ {%- endmacro -%}
148
+ {%- macro strip_thinking(text) -%}
149
+ {%- set ns = namespace(result='') -%}
150
+ {%- for part in text.split('<channel|>') -%}
151
+ {%- if '<|channel>' in part -%}
152
+ {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
153
+ {%- else -%}
154
+ {%- set ns.result = ns.result + part -%}
155
+ {%- endif -%}
156
+ {%- endfor -%}
157
+ {{- ns.result | trim -}}
158
+ {%- endmacro -%}
159
+
160
+ {%- macro format_tool_response_block(tool_name, response) -%}
161
+ {{- '<|tool_response>' -}}
162
+ {%- if response is mapping -%}
163
+ {{- 'response:' + tool_name + '{' -}}
164
+ {%- for key, value in response | dictsort -%}
165
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
166
+ {%- if not loop.last %},{% endif -%}
167
+ {%- endfor -%}
168
+ {{- '}' -}}
169
+ {%- else -%}
170
+ {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
171
+ {%- endif -%}
172
+ {{- '<tool_response|>' -}}
173
+ {%- endmacro -%}
174
+
175
+ {%- set ns = namespace(prev_message_type=None) -%}
176
+ {%- set loop_messages = messages -%}
177
+ {{- bos_token -}}
178
+ {#- Handle System/Tool Definitions Block -#}
179
+ {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
+ {{- '<|turn>system\n' -}}
181
+ {#- Inject Thinking token at the very top of the FIRST system turn -#}
182
+ {%- if enable_thinking is defined and enable_thinking -%}
183
+ {{- '<|think|>\n' -}}
184
+ {%- set ns.prev_message_type = 'think' -%}
185
+ {%- endif -%}
186
+ {%- if messages[0]['role'] in ['system', 'developer'] -%}
187
+ {%- if messages[0]['content'] is string -%}
188
+ {{- messages[0]['content'] | trim -}}
189
+ {%- elif messages[0]['content'] is sequence -%}
190
+ {%- for item in messages[0]['content'] -%}
191
+ {{- item['text'] | trim + ' '-}}
192
+ {%- endfor -%}
193
+ {%- endif -%}
194
+ {%- set loop_messages = messages[1:] -%}
195
+ {%- endif -%}
196
+ {%- if tools -%}
197
+ {%- for tool in tools %}
198
+ {{- '<|tool>' -}}
199
+ {{- format_function_declaration(tool) | trim -}}
200
+ {{- '<tool|>' -}}
201
+ {%- endfor %}
202
+ {%- set ns.prev_message_type = 'tool' -%}
203
+ {%- endif -%}
204
+ {{- '<turn|>\n' -}}
205
+ {%- endif %}
206
+
207
+ {#- Pre-scan: find last user message index for reasoning guard -#}
208
+ {%- set ns_turn = namespace(last_user_idx=-1) -%}
209
+ {%- for i in range(loop_messages | length) -%}
210
+ {%- if loop_messages[i]['role'] == 'user' -%}
211
+ {%- set ns_turn.last_user_idx = i -%}
212
+ {%- endif -%}
213
+ {%- endfor -%}
214
+
215
+ {#- Loop through messages -#}
216
+ {%- for message in loop_messages -%}
217
+ {%- if message['role'] != 'tool' -%}
218
+ {%- set ns.prev_message_type = None -%}
219
+ {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
220
+ {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
221
+ {%- set prev_nt = namespace(role=None, found=false) -%}
222
+ {%- if loop.index0 > 0 -%}
223
+ {%- for j in range(loop.index0 - 1, -1, -1) -%}
224
+ {%- if not prev_nt.found -%}
225
+ {%- if loop_messages[j]['role'] != 'tool' -%}
226
+ {%- set prev_nt.role = loop_messages[j]['role'] -%}
227
+ {%- set prev_nt.found = true -%}
228
+ {%- endif -%}
229
+ {%- endif -%}
230
+ {%- endfor -%}
231
+ {%- endif -%}
232
+ {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
233
+ {%- if not continue_same_model_turn -%}
234
+ {{- '<|turn>' + role + '\n' }}
235
+ {%- endif -%}
236
+
237
+ {#- Render reasoning/reasoning_content as thinking channel -#}
238
+ {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
239
+ {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
240
+ {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
241
+ {%- endif -%}
242
+
243
+ {%- if message['tool_calls'] -%}
244
+ {%- for tool_call in message['tool_calls'] -%}
245
+ {%- set function = tool_call['function'] -%}
246
+ {{- '<|tool_call>call:' + function['name'] + '{' -}}
247
+ {%- if function['arguments'] is mapping -%}
248
+ {%- set ns_args = namespace(found_first=false) -%}
249
+ {%- for key, value in function['arguments'] | dictsort -%}
250
+ {%- if ns_args.found_first %},{% endif -%}
251
+ {%- set ns_args.found_first = true -%}
252
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
253
+ {%- endfor -%}
254
+ {%- elif function['arguments'] is string -%}
255
+ {{- function['arguments'] -}}
256
+ {%- endif -%}
257
+ {{- '}<tool_call|>' -}}
258
+ {%- endfor -%}
259
+ {%- set ns.prev_message_type = 'tool_call' -%}
260
+ {%- endif -%}
261
+
262
+ {%- set ns_tr_out = namespace(flag=false) -%}
263
+ {%- if message.get('tool_responses') -%}
264
+ {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
265
+ {%- for tool_response in message['tool_responses'] -%}
266
+ {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
267
+ {%- set ns_tr_out.flag = true -%}
268
+ {%- set ns.prev_message_type = 'tool_response' -%}
269
+ {%- endfor -%}
270
+ {%- elif message.get('tool_calls') -%}
271
+ {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
272
+ {%- set ns_tool_scan = namespace(stopped=false) -%}
273
+ {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
274
+ {%- if ns_tool_scan.stopped -%}
275
+ {%- elif loop_messages[k]['role'] != 'tool' -%}
276
+ {%- set ns_tool_scan.stopped = true -%}
277
+ {%- else -%}
278
+ {%- set follow = loop_messages[k] -%}
279
+ {#- Resolve tool_call_id to function name -#}
280
+ {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
281
+ {%- for tc in message['tool_calls'] -%}
282
+ {%- if tc.get('id') == follow.get('tool_call_id') -%}
283
+ {%- set ns_tname.name = tc['function']['name'] -%}
284
+ {%- endif -%}
285
+ {%- endfor -%}
286
+ {#- Handle content as string or content-parts array -#}
287
+ {%- set tool_body = follow.get('content') -%}
288
+ {%- if tool_body is string -%}
289
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
290
+ {%- elif tool_body is sequence and tool_body is not string -%}
291
+ {%- set ns_txt = namespace(s='') -%}
292
+ {%- for part in tool_body -%}
293
+ {%- if part.get('type') == 'text' -%}
294
+ {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
295
+ {%- endif -%}
296
+ {%- endfor -%}
297
+ {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
298
+ {%- else -%}
299
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
300
+ {%- endif -%}
301
+ {%- set ns_tr_out.flag = true -%}
302
+ {%- set ns.prev_message_type = 'tool_response' -%}
303
+ {%- endif -%}
304
+ {%- endfor -%}
305
+ {%- endif -%}
306
+
307
+ {%- set captured_content -%}
308
+ {%- if message['content'] is string -%}
309
+ {%- if role == 'model' -%}
310
+ {{- strip_thinking(message['content']) -}}
311
+ {%- else -%}
312
+ {{- message['content'] | trim -}}
313
+ {%- endif -%}
314
+ {%- elif message['content'] is sequence -%}
315
+ {%- for item in message['content'] -%}
316
+ {%- if item['type'] == 'text' -%}
317
+ {%- if role == 'model' -%}
318
+ {{- strip_thinking(item['text']) -}}
319
+ {%- else -%}
320
+ {{- item['text'] | trim -}}
321
+ {%- endif -%}
322
+ {%- elif item['type'] == 'image' -%}
323
+ {{- '<|image|>' -}}
324
+ {%- set ns.prev_message_type = 'image' -%}
325
+ {%- elif item['type'] == 'audio' -%}
326
+ {{- '<|audio|>' -}}
327
+ {%- set ns.prev_message_type = 'audio' -%}
328
+ {%- elif item['type'] == 'video' -%}
329
+ {{- '<|video|>' -}}
330
+ {%- set ns.prev_message_type = 'video' -%}
331
+ {%- endif -%}
332
+ {%- endfor -%}
333
+ {%- endif -%}
334
+ {%- endset -%}
335
+
336
+ {{- captured_content -}}
337
+ {%- set has_content = captured_content | trim | length > 0 -%}
338
+
339
+ {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
340
+ {{- '<|tool_response>' -}}
341
+ {%- elif not (ns_tr_out.flag and not has_content) -%}
342
+ {{- '<turn|>\n' -}}
343
+ {%- endif -%}
344
+ {%- endif -%}
345
+ {%- endfor -%}
346
+
347
+ {%- if add_generation_prompt -%}
348
+ {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
349
+ {{- '<|turn>model\n' -}}
350
+ {%- endif -%}
351
+ {%- endif -%}
checkpoint-100/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/gemma-4-E4B-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/gemma-4-E4B-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.19.1
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma4ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma4.modeling_gemma4",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 16,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.0,
26
+ "lora_ga_config": null,
27
+ "megatron_config": null,
28
+ "megatron_core": "megatron.core",
29
+ "modules_to_save": null,
30
+ "peft_type": "LORA",
31
+ "peft_version": "0.19.1",
32
+ "qalora_group_size": 16,
33
+ "r": 16,
34
+ "rank_pattern": {},
35
+ "revision": null,
36
+ "target_modules": [
37
+ "gate_proj",
38
+ "v_proj",
39
+ "o_proj",
40
+ "k_proj",
41
+ "up_proj",
42
+ "down_proj",
43
+ "q_proj"
44
+ ],
45
+ "target_parameters": null,
46
+ "task_type": "CAUSAL_LM",
47
+ "trainable_token_indices": null,
48
+ "use_bdlora": null,
49
+ "use_dora": false,
50
+ "use_qalora": false,
51
+ "use_rslora": false
52
+ }
checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a0d76b0ebb45ec68a37d642d7342c66a7ebc9bc3239f3387972226f24509e56
3
+ size 169741912
checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required, filter_keys=false) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- set add_comma = false -%}
6
+ {%- if not filter_keys or key not in standard_keys -%}
7
+ {%- if ns.found_first %},{% endif -%}
8
+ {%- set ns.found_first = true -%}
9
+ {{ key }}:{
10
+ {%- if value['description'] -%}
11
+ description:<|"|>{{ value['description'] }}<|"|>
12
+ {%- set add_comma = true -%}
13
+ {%- endif -%}
14
+ {%- if value['type'] | upper == 'STRING' -%}
15
+ {%- if value['enum'] -%}
16
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
17
+ enum:{{ format_argument(value['enum']) }}
18
+ {%- endif -%}
19
+ {%- elif value['type'] | upper == 'ARRAY' -%}
20
+ {%- if value['items'] is mapping and value['items'] -%}
21
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
22
+ items:{
23
+ {%- set ns_items = namespace(found_first=false) -%}
24
+ {%- for item_key, item_value in value['items'] | dictsort -%}
25
+ {%- if item_value is not none -%}
26
+ {%- if ns_items.found_first %},{% endif -%}
27
+ {%- set ns_items.found_first = true -%}
28
+ {%- if item_key == 'properties' -%}
29
+ properties:{
30
+ {%- if item_value is mapping -%}
31
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
32
+ {%- endif -%}
33
+ }
34
+ {%- elif item_key == 'required' -%}
35
+ required:[
36
+ {%- for req_item in item_value -%}
37
+ <|"|>{{- req_item -}}<|"|>
38
+ {%- if not loop.last %},{% endif -%}
39
+ {%- endfor -%}
40
+ ]
41
+ {%- elif item_key == 'type' -%}
42
+ {%- if item_value is string -%}
43
+ type:{{ format_argument(item_value | upper) }}
44
+ {%- else -%}
45
+ type:{{ format_argument(item_value | map('upper') | list) }}
46
+ {%- endif -%}
47
+ {%- else -%}
48
+ {{ item_key }}:{{ format_argument(item_value) }}
49
+ {%- endif -%}
50
+ {%- endif -%}
51
+ {%- endfor -%}
52
+ }
53
+ {%- endif -%}
54
+ {%- endif -%}
55
+ {%- if value['nullable'] %}
56
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
57
+ nullable:true
58
+ {%- endif -%}
59
+ {%- if value['type'] | upper == 'OBJECT' -%}
60
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
61
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
62
+ properties:{
63
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
64
+ }
65
+ {%- elif value is mapping -%}
66
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
+ properties:{
68
+ {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
69
+ }
70
+ {%- endif -%}
71
+ {%- if value['required'] -%}
72
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
73
+ required:[
74
+ {%- for item in value['required'] | default([]) -%}
75
+ <|"|>{{- item -}}<|"|>
76
+ {%- if not loop.last %},{% endif -%}
77
+ {%- endfor -%}
78
+ ]
79
+ {%- endif -%}
80
+ {%- endif -%}
81
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
82
+ type:<|"|>{{ value['type'] | upper }}<|"|>}
83
+ {%- endif -%}
84
+ {%- endfor -%}
85
+ {%- endmacro -%}
86
+ {%- macro format_function_declaration(tool_data) -%}
87
+ declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
88
+ {%- set params = tool_data['function']['parameters'] -%}
89
+ {%- if params -%}
90
+ ,parameters:{
91
+ {%- if params['properties'] -%}
92
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
93
+ {%- endif -%}
94
+ {%- if params['required'] -%}
95
+ required:[
96
+ {%- for item in params['required'] -%}
97
+ <|"|>{{- item -}}<|"|>
98
+ {{- ',' if not loop.last -}}
99
+ {%- endfor -%}
100
+ ],
101
+ {%- endif -%}
102
+ {%- if params['type'] -%}
103
+ type:<|"|>{{- params['type'] | upper -}}<|"|>}
104
+ {%- endif -%}
105
+ {%- endif -%}
106
+ {%- if 'response' in tool_data['function'] -%}
107
+ {%- set response_declaration = tool_data['function']['response'] -%}
108
+ ,response:{
109
+ {%- if response_declaration['description'] -%}
110
+ description:<|"|>{{- response_declaration['description'] -}}<|"|>,
111
+ {%- endif -%}
112
+ {%- if response_declaration['type'] | upper == 'OBJECT' -%}
113
+ type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
114
+ {%- endif -%}
115
+ {%- endif -%}
116
+ }
117
+ {%- endmacro -%}
118
+ {%- macro format_argument(argument, escape_keys=True) -%}
119
+ {%- if argument is string -%}
120
+ {{- '<|"|>' + argument + '<|"|>' -}}
121
+ {%- elif argument is boolean -%}
122
+ {{- 'true' if argument else 'false' -}}
123
+ {%- elif argument is mapping -%}
124
+ {{- '{' -}}
125
+ {%- set ns = namespace(found_first=false) -%}
126
+ {%- for key, value in argument | dictsort -%}
127
+ {%- if ns.found_first %},{% endif -%}
128
+ {%- set ns.found_first = true -%}
129
+ {%- if escape_keys -%}
130
+ {{- '<|"|>' + key + '<|"|>' -}}
131
+ {%- else -%}
132
+ {{- key -}}
133
+ {%- endif -%}
134
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
135
+ {%- endfor -%}
136
+ {{- '}' -}}
137
+ {%- elif argument is sequence -%}
138
+ {{- '[' -}}
139
+ {%- for item in argument -%}
140
+ {{- format_argument(item, escape_keys=escape_keys) -}}
141
+ {%- if not loop.last %},{% endif -%}
142
+ {%- endfor -%}
143
+ {{- ']' -}}
144
+ {%- else -%}
145
+ {{- argument -}}
146
+ {%- endif -%}
147
+ {%- endmacro -%}
148
+ {%- macro strip_thinking(text) -%}
149
+ {%- set ns = namespace(result='') -%}
150
+ {%- for part in text.split('<channel|>') -%}
151
+ {%- if '<|channel>' in part -%}
152
+ {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
153
+ {%- else -%}
154
+ {%- set ns.result = ns.result + part -%}
155
+ {%- endif -%}
156
+ {%- endfor -%}
157
+ {{- ns.result | trim -}}
158
+ {%- endmacro -%}
159
+
160
+ {%- macro format_tool_response_block(tool_name, response) -%}
161
+ {{- '<|tool_response>' -}}
162
+ {%- if response is mapping -%}
163
+ {{- 'response:' + tool_name + '{' -}}
164
+ {%- for key, value in response | dictsort -%}
165
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
166
+ {%- if not loop.last %},{% endif -%}
167
+ {%- endfor -%}
168
+ {{- '}' -}}
169
+ {%- else -%}
170
+ {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
171
+ {%- endif -%}
172
+ {{- '<tool_response|>' -}}
173
+ {%- endmacro -%}
174
+
175
+ {%- set ns = namespace(prev_message_type=None) -%}
176
+ {%- set loop_messages = messages -%}
177
+ {{- bos_token -}}
178
+ {#- Handle System/Tool Definitions Block -#}
179
+ {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
+ {{- '<|turn>system\n' -}}
181
+ {#- Inject Thinking token at the very top of the FIRST system turn -#}
182
+ {%- if enable_thinking is defined and enable_thinking -%}
183
+ {{- '<|think|>\n' -}}
184
+ {%- set ns.prev_message_type = 'think' -%}
185
+ {%- endif -%}
186
+ {%- if messages[0]['role'] in ['system', 'developer'] -%}
187
+ {%- if messages[0]['content'] is string -%}
188
+ {{- messages[0]['content'] | trim -}}
189
+ {%- elif messages[0]['content'] is sequence -%}
190
+ {%- for item in messages[0]['content'] -%}
191
+ {{- item['text'] | trim + ' '-}}
192
+ {%- endfor -%}
193
+ {%- endif -%}
194
+ {%- set loop_messages = messages[1:] -%}
195
+ {%- endif -%}
196
+ {%- if tools -%}
197
+ {%- for tool in tools %}
198
+ {{- '<|tool>' -}}
199
+ {{- format_function_declaration(tool) | trim -}}
200
+ {{- '<tool|>' -}}
201
+ {%- endfor %}
202
+ {%- set ns.prev_message_type = 'tool' -%}
203
+ {%- endif -%}
204
+ {{- '<turn|>\n' -}}
205
+ {%- endif %}
206
+
207
+ {#- Pre-scan: find last user message index for reasoning guard -#}
208
+ {%- set ns_turn = namespace(last_user_idx=-1) -%}
209
+ {%- for i in range(loop_messages | length) -%}
210
+ {%- if loop_messages[i]['role'] == 'user' -%}
211
+ {%- set ns_turn.last_user_idx = i -%}
212
+ {%- endif -%}
213
+ {%- endfor -%}
214
+
215
+ {#- Loop through messages -#}
216
+ {%- for message in loop_messages -%}
217
+ {%- if message['role'] != 'tool' -%}
218
+ {%- set ns.prev_message_type = None -%}
219
+ {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
220
+ {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
221
+ {%- set prev_nt = namespace(role=None, found=false) -%}
222
+ {%- if loop.index0 > 0 -%}
223
+ {%- for j in range(loop.index0 - 1, -1, -1) -%}
224
+ {%- if not prev_nt.found -%}
225
+ {%- if loop_messages[j]['role'] != 'tool' -%}
226
+ {%- set prev_nt.role = loop_messages[j]['role'] -%}
227
+ {%- set prev_nt.found = true -%}
228
+ {%- endif -%}
229
+ {%- endif -%}
230
+ {%- endfor -%}
231
+ {%- endif -%}
232
+ {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
233
+ {%- if not continue_same_model_turn -%}
234
+ {{- '<|turn>' + role + '\n' }}
235
+ {%- endif -%}
236
+
237
+ {#- Render reasoning/reasoning_content as thinking channel -#}
238
+ {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
239
+ {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
240
+ {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
241
+ {%- endif -%}
242
+
243
+ {%- if message['tool_calls'] -%}
244
+ {%- for tool_call in message['tool_calls'] -%}
245
+ {%- set function = tool_call['function'] -%}
246
+ {{- '<|tool_call>call:' + function['name'] + '{' -}}
247
+ {%- if function['arguments'] is mapping -%}
248
+ {%- set ns_args = namespace(found_first=false) -%}
249
+ {%- for key, value in function['arguments'] | dictsort -%}
250
+ {%- if ns_args.found_first %},{% endif -%}
251
+ {%- set ns_args.found_first = true -%}
252
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
253
+ {%- endfor -%}
254
+ {%- elif function['arguments'] is string -%}
255
+ {{- function['arguments'] -}}
256
+ {%- endif -%}
257
+ {{- '}<tool_call|>' -}}
258
+ {%- endfor -%}
259
+ {%- set ns.prev_message_type = 'tool_call' -%}
260
+ {%- endif -%}
261
+
262
+ {%- set ns_tr_out = namespace(flag=false) -%}
263
+ {%- if message.get('tool_responses') -%}
264
+ {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
265
+ {%- for tool_response in message['tool_responses'] -%}
266
+ {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
267
+ {%- set ns_tr_out.flag = true -%}
268
+ {%- set ns.prev_message_type = 'tool_response' -%}
269
+ {%- endfor -%}
270
+ {%- elif message.get('tool_calls') -%}
271
+ {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
272
+ {%- set ns_tool_scan = namespace(stopped=false) -%}
273
+ {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
274
+ {%- if ns_tool_scan.stopped -%}
275
+ {%- elif loop_messages[k]['role'] != 'tool' -%}
276
+ {%- set ns_tool_scan.stopped = true -%}
277
+ {%- else -%}
278
+ {%- set follow = loop_messages[k] -%}
279
+ {#- Resolve tool_call_id to function name -#}
280
+ {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
281
+ {%- for tc in message['tool_calls'] -%}
282
+ {%- if tc.get('id') == follow.get('tool_call_id') -%}
283
+ {%- set ns_tname.name = tc['function']['name'] -%}
284
+ {%- endif -%}
285
+ {%- endfor -%}
286
+ {#- Handle content as string or content-parts array -#}
287
+ {%- set tool_body = follow.get('content') -%}
288
+ {%- if tool_body is string -%}
289
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
290
+ {%- elif tool_body is sequence and tool_body is not string -%}
291
+ {%- set ns_txt = namespace(s='') -%}
292
+ {%- for part in tool_body -%}
293
+ {%- if part.get('type') == 'text' -%}
294
+ {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
295
+ {%- endif -%}
296
+ {%- endfor -%}
297
+ {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
298
+ {%- else -%}
299
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
300
+ {%- endif -%}
301
+ {%- set ns_tr_out.flag = true -%}
302
+ {%- set ns.prev_message_type = 'tool_response' -%}
303
+ {%- endif -%}
304
+ {%- endfor -%}
305
+ {%- endif -%}
306
+
307
+ {%- set captured_content -%}
308
+ {%- if message['content'] is string -%}
309
+ {%- if role == 'model' -%}
310
+ {{- strip_thinking(message['content']) -}}
311
+ {%- else -%}
312
+ {{- message['content'] | trim -}}
313
+ {%- endif -%}
314
+ {%- elif message['content'] is sequence -%}
315
+ {%- for item in message['content'] -%}
316
+ {%- if item['type'] == 'text' -%}
317
+ {%- if role == 'model' -%}
318
+ {{- strip_thinking(item['text']) -}}
319
+ {%- else -%}
320
+ {{- item['text'] | trim -}}
321
+ {%- endif -%}
322
+ {%- elif item['type'] == 'image' -%}
323
+ {{- '<|image|>' -}}
324
+ {%- set ns.prev_message_type = 'image' -%}
325
+ {%- elif item['type'] == 'audio' -%}
326
+ {{- '<|audio|>' -}}
327
+ {%- set ns.prev_message_type = 'audio' -%}
328
+ {%- elif item['type'] == 'video' -%}
329
+ {{- '<|video|>' -}}
330
+ {%- set ns.prev_message_type = 'video' -%}
331
+ {%- endif -%}
332
+ {%- endfor -%}
333
+ {%- endif -%}
334
+ {%- endset -%}
335
+
336
+ {{- captured_content -}}
337
+ {%- set has_content = captured_content | trim | length > 0 -%}
338
+
339
+ {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
340
+ {{- '<|tool_response>' -}}
341
+ {%- elif not (ns_tr_out.flag and not has_content) -%}
342
+ {{- '<turn|>\n' -}}
343
+ {%- endif -%}
344
+ {%- endif -%}
345
+ {%- endfor -%}
346
+
347
+ {%- if add_generation_prompt -%}
348
+ {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
349
+ {{- '<|turn>model\n' -}}
350
+ {%- endif -%}
351
+ {%- endif -%}
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ebe97922ef0bee5a2887cb2ee8f12595764d517de7176ed003caf71939844df
3
+ size 71463733
checkpoint-100/processor_config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_ms_per_token": 40,
3
+ "audio_seq_length": 750,
4
+ "feature_extractor": {
5
+ "dither": 0.0,
6
+ "feature_extractor_type": "Gemma4AudioFeatureExtractor",
7
+ "feature_size": 128,
8
+ "fft_length": 512,
9
+ "fft_overdrive": false,
10
+ "frame_length": 320,
11
+ "hop_length": 160,
12
+ "input_scale_factor": 1.0,
13
+ "max_frequency": 8000.0,
14
+ "mel_floor": 0.001,
15
+ "min_frequency": 0.0,
16
+ "padding_side": "left",
17
+ "padding_value": 0.0,
18
+ "per_bin_mean": null,
19
+ "per_bin_stddev": null,
20
+ "preemphasis": 0.0,
21
+ "preemphasis_htk_flavor": true,
22
+ "return_attention_mask": true,
23
+ "sampling_rate": 16000
24
+ },
25
+ "image_processor": {
26
+ "do_convert_rgb": true,
27
+ "do_normalize": false,
28
+ "do_rescale": true,
29
+ "do_resize": true,
30
+ "image_mean": [
31
+ 0.0,
32
+ 0.0,
33
+ 0.0
34
+ ],
35
+ "image_processor_type": "Gemma4ImageProcessor",
36
+ "image_seq_length": 280,
37
+ "image_std": [
38
+ 1.0,
39
+ 1.0,
40
+ 1.0
41
+ ],
42
+ "max_soft_tokens": 280,
43
+ "patch_size": 16,
44
+ "pooling_kernel_size": 3,
45
+ "resample": 3,
46
+ "rescale_factor": 0.00392156862745098
47
+ },
48
+ "image_seq_length": 280,
49
+ "processor_class": "Gemma4Processor",
50
+ "video_processor": {
51
+ "do_convert_rgb": true,
52
+ "do_normalize": true,
53
+ "do_rescale": true,
54
+ "do_resize": true,
55
+ "do_sample_frames": true,
56
+ "image_mean": [
57
+ 0.0,
58
+ 0.0,
59
+ 0.0
60
+ ],
61
+ "image_std": [
62
+ 1.0,
63
+ 1.0,
64
+ 1.0
65
+ ],
66
+ "max_soft_tokens": 70,
67
+ "num_frames": 32,
68
+ "patch_size": 16,
69
+ "pooling_kernel_size": 3,
70
+ "resample": 3,
71
+ "rescale_factor": 0.00392156862745098,
72
+ "return_metadata": false,
73
+ "video_processor_type": "Gemma4VideoProcessor"
74
+ }
75
+ }
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
+ size 14645
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfa39a08ca6ca0b25c44556fe7464362808ae67fd00d1432e1130777acac8674
3
+ size 1465
checkpoint-100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
3
+ size 32169626
checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<turn|>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 131072,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "right",
45
+ "processor_class": "Gemma4Processor",
46
+ "response_schema": {
47
+ "properties": {
48
+ "content": {
49
+ "type": "string"
50
+ },
51
+ "role": {
52
+ "const": "assistant"
53
+ },
54
+ "thinking": {
55
+ "type": "string"
56
+ },
57
+ "tool_calls": {
58
+ "items": {
59
+ "properties": {
60
+ "function": {
61
+ "properties": {
62
+ "arguments": {
63
+ "additionalProperties": {},
64
+ "type": "object",
65
+ "x-parser": "gemma4-tool-call"
66
+ },
67
+ "name": {
68
+ "type": "string"
69
+ }
70
+ },
71
+ "type": "object",
72
+ "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
73
+ },
74
+ "type": {
75
+ "const": "function"
76
+ }
77
+ },
78
+ "type": "object"
79
+ },
80
+ "type": "array",
81
+ "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
82
+ }
83
+ },
84
+ "type": "object",
85
+ "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
86
+ },
87
+ "soc_token": "<|channel>",
88
+ "sot_token": "<|turn>",
89
+ "stc_token": "<|tool_call>",
90
+ "std_token": "<|tool>",
91
+ "str_token": "<|tool_response>",
92
+ "think_token": "<|think|>",
93
+ "tokenizer_class": "GemmaTokenizer",
94
+ "unk_token": "<unk>",
95
+ "added_tokens_decoder": {
96
+ "0": {
97
+ "content": "<pad>",
98
+ "single_word": false,
99
+ "lstrip": false,
100
+ "rstrip": false,
101
+ "normalized": false,
102
+ "special": true
103
+ },
104
+ "1": {
105
+ "content": "<eos>",
106
+ "single_word": false,
107
+ "lstrip": false,
108
+ "rstrip": false,
109
+ "normalized": false,
110
+ "special": true
111
+ },
112
+ "2": {
113
+ "content": "<bos>",
114
+ "single_word": false,
115
+ "lstrip": false,
116
+ "rstrip": false,
117
+ "normalized": false,
118
+ "special": true
119
+ },
120
+ "3": {
121
+ "content": "<unk>",
122
+ "single_word": false,
123
+ "lstrip": false,
124
+ "rstrip": false,
125
+ "normalized": false,
126
+ "special": true
127
+ },
128
+ "4": {
129
+ "content": "<mask>",
130
+ "single_word": false,
131
+ "lstrip": false,
132
+ "rstrip": false,
133
+ "normalized": false,
134
+ "special": true
135
+ },
136
+ "46": {
137
+ "content": "<|tool>",
138
+ "single_word": false,
139
+ "lstrip": false,
140
+ "rstrip": false,
141
+ "normalized": false,
142
+ "special": true
143
+ },
144
+ "47": {
145
+ "content": "<tool|>",
146
+ "single_word": false,
147
+ "lstrip": false,
148
+ "rstrip": false,
149
+ "normalized": false,
150
+ "special": true
151
+ },
152
+ "48": {
153
+ "content": "<|tool_call>",
154
+ "single_word": false,
155
+ "lstrip": false,
156
+ "rstrip": false,
157
+ "normalized": false,
158
+ "special": true
159
+ },
160
+ "49": {
161
+ "content": "<tool_call|>",
162
+ "single_word": false,
163
+ "lstrip": false,
164
+ "rstrip": false,
165
+ "normalized": false,
166
+ "special": true
167
+ },
168
+ "50": {
169
+ "content": "<|tool_response>",
170
+ "single_word": false,
171
+ "lstrip": false,
172
+ "rstrip": false,
173
+ "normalized": false,
174
+ "special": true
175
+ },
176
+ "51": {
177
+ "content": "<tool_response|>",
178
+ "single_word": false,
179
+ "lstrip": false,
180
+ "rstrip": false,
181
+ "normalized": false,
182
+ "special": true
183
+ },
184
+ "52": {
185
+ "content": "<|\"|>",
186
+ "single_word": false,
187
+ "lstrip": false,
188
+ "rstrip": false,
189
+ "normalized": false,
190
+ "special": true
191
+ },
192
+ "98": {
193
+ "content": "<|think|>",
194
+ "single_word": false,
195
+ "lstrip": false,
196
+ "rstrip": false,
197
+ "normalized": false,
198
+ "special": true
199
+ },
200
+ "100": {
201
+ "content": "<|channel>",
202
+ "single_word": false,
203
+ "lstrip": false,
204
+ "rstrip": false,
205
+ "normalized": false,
206
+ "special": true
207
+ },
208
+ "101": {
209
+ "content": "<channel|>",
210
+ "single_word": false,
211
+ "lstrip": false,
212
+ "rstrip": false,
213
+ "normalized": false,
214
+ "special": true
215
+ },
216
+ "105": {
217
+ "content": "<|turn>",
218
+ "single_word": false,
219
+ "lstrip": false,
220
+ "rstrip": false,
221
+ "normalized": false,
222
+ "special": true
223
+ },
224
+ "106": {
225
+ "content": "<turn|>",
226
+ "single_word": false,
227
+ "lstrip": false,
228
+ "rstrip": false,
229
+ "normalized": false,
230
+ "special": true
231
+ },
232
+ "255999": {
233
+ "content": "<|image>",
234
+ "single_word": false,
235
+ "lstrip": false,
236
+ "rstrip": false,
237
+ "normalized": false,
238
+ "special": true
239
+ },
240
+ "256000": {
241
+ "content": "<|audio>",
242
+ "single_word": false,
243
+ "lstrip": false,
244
+ "rstrip": false,
245
+ "normalized": false,
246
+ "special": true
247
+ },
248
+ "258880": {
249
+ "content": "<|image|>",
250
+ "single_word": false,
251
+ "lstrip": false,
252
+ "rstrip": false,
253
+ "normalized": false,
254
+ "special": true
255
+ },
256
+ "258881": {
257
+ "content": "<|audio|>",
258
+ "single_word": false,
259
+ "lstrip": false,
260
+ "rstrip": false,
261
+ "normalized": false,
262
+ "special": true
263
+ },
264
+ "258882": {
265
+ "content": "<image|>",
266
+ "single_word": false,
267
+ "lstrip": false,
268
+ "rstrip": false,
269
+ "normalized": false,
270
+ "special": true
271
+ },
272
+ "258883": {
273
+ "content": "<audio|>",
274
+ "single_word": false,
275
+ "lstrip": false,
276
+ "rstrip": false,
277
+ "normalized": false,
278
+ "special": true
279
+ },
280
+ "258884": {
281
+ "content": "<|video|>",
282
+ "single_word": false,
283
+ "lstrip": false,
284
+ "rstrip": false,
285
+ "normalized": false,
286
+ "special": true
287
+ }
288
+ }
289
+ }
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.018195050946142648,
6
+ "eval_steps": 100,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0009097525473071324,
14
+ "grad_norm": 1.0602493286132812,
15
+ "learning_rate": 1.2121212121212122e-06,
16
+ "loss": 1.7156932830810547,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.001819505094614265,
21
+ "grad_norm": 1.1577719449996948,
22
+ "learning_rate": 2.7272727272727272e-06,
23
+ "loss": 1.6629371643066406,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.0027292576419213972,
28
+ "grad_norm": 1.0288419723510742,
29
+ "learning_rate": 4.242424242424243e-06,
30
+ "loss": 1.6706295013427734,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.00363901018922853,
35
+ "grad_norm": 2.129403829574585,
36
+ "learning_rate": 5.7575757575757586e-06,
37
+ "loss": 1.7363752365112304,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.004548762736535662,
42
+ "grad_norm": 1.9468326568603516,
43
+ "learning_rate": 7.272727272727272e-06,
44
+ "loss": 1.7111135482788087,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.0054585152838427945,
49
+ "grad_norm": 1.1269357204437256,
50
+ "learning_rate": 8.787878787878788e-06,
51
+ "loss": 1.6924203872680663,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.006368267831149927,
56
+ "grad_norm": 1.4021248817443848,
57
+ "learning_rate": 1.0303030303030304e-05,
58
+ "loss": 1.658310317993164,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.00727802037845706,
63
+ "grad_norm": 1.313381314277649,
64
+ "learning_rate": 1.1818181818181819e-05,
65
+ "loss": 1.5383296012878418,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.008187772925764192,
70
+ "grad_norm": 2.4359891414642334,
71
+ "learning_rate": 1.3333333333333333e-05,
72
+ "loss": 1.4302565574645996,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.009097525473071324,
77
+ "grad_norm": 1.6459542512893677,
78
+ "learning_rate": 1.484848484848485e-05,
79
+ "loss": 1.2602953910827637,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.010007278020378457,
84
+ "grad_norm": 0.7953159213066101,
85
+ "learning_rate": 1.6363636363636366e-05,
86
+ "loss": 1.204326343536377,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.010917030567685589,
91
+ "grad_norm": 0.5824465155601501,
92
+ "learning_rate": 1.787878787878788e-05,
93
+ "loss": 1.068561840057373,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.011826783114992722,
98
+ "grad_norm": 0.39265626668930054,
99
+ "learning_rate": 1.9393939393939395e-05,
100
+ "loss": 0.9570062637329102,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.012736535662299854,
105
+ "grad_norm": 0.3387283384799957,
106
+ "learning_rate": 2.090909090909091e-05,
107
+ "loss": 0.9454713821411133,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.013646288209606987,
112
+ "grad_norm": 0.3182811141014099,
113
+ "learning_rate": 2.2424242424242424e-05,
114
+ "loss": 0.8901592254638672,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.01455604075691412,
119
+ "grad_norm": 0.2735312879085541,
120
+ "learning_rate": 2.393939393939394e-05,
121
+ "loss": 0.8491583824157715,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.015465793304221253,
126
+ "grad_norm": 0.2376435250043869,
127
+ "learning_rate": 2.5454545454545454e-05,
128
+ "loss": 0.8109179496765136,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.016375545851528384,
133
+ "grad_norm": 0.2161586880683899,
134
+ "learning_rate": 2.696969696969697e-05,
135
+ "loss": 0.76962308883667,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.017285298398835518,
140
+ "grad_norm": 0.19587980210781097,
141
+ "learning_rate": 2.8484848484848486e-05,
142
+ "loss": 0.7301986694335938,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.018195050946142648,
147
+ "grad_norm": 0.20971694588661194,
148
+ "learning_rate": 3e-05,
149
+ "loss": 0.7269618034362793,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.018195050946142648,
154
+ "eval_loss": 2.605874538421631,
155
+ "eval_runtime": 1120.0905,
156
+ "eval_samples_per_second": 33.935,
157
+ "eval_steps_per_second": 8.484,
158
+ "step": 100
159
+ }
160
+ ],
161
+ "logging_steps": 5,
162
+ "max_steps": 5500,
163
+ "num_input_tokens_seen": 0,
164
+ "num_train_epochs": 2,
165
+ "save_steps": 100,
166
+ "stateful_callbacks": {
167
+ "TrainerControl": {
168
+ "args": {
169
+ "should_epoch_stop": false,
170
+ "should_evaluate": false,
171
+ "should_log": false,
172
+ "should_save": true,
173
+ "should_training_stop": false
174
+ },
175
+ "attributes": {}
176
+ }
177
+ },
178
+ "total_flos": 6.444622973392128e+16,
179
+ "train_batch_size": 8,
180
+ "trial_name": null,
181
+ "trial_params": null
182
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:195f79601dec1ad668a414b5c045319cec84f48961f45b7d32762f86750cd8b1
3
+ size 5777
checkpoint-1000/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/gemma-4-E4B-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/gemma-4-E4B-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.19.1
checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma4ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma4.modeling_gemma4",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 16,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.0,
26
+ "lora_ga_config": null,
27
+ "megatron_config": null,
28
+ "megatron_core": "megatron.core",
29
+ "modules_to_save": null,
30
+ "peft_type": "LORA",
31
+ "peft_version": "0.19.1",
32
+ "qalora_group_size": 16,
33
+ "r": 16,
34
+ "rank_pattern": {},
35
+ "revision": null,
36
+ "target_modules": [
37
+ "gate_proj",
38
+ "v_proj",
39
+ "o_proj",
40
+ "k_proj",
41
+ "up_proj",
42
+ "down_proj",
43
+ "q_proj"
44
+ ],
45
+ "target_parameters": null,
46
+ "task_type": "CAUSAL_LM",
47
+ "trainable_token_indices": null,
48
+ "use_bdlora": null,
49
+ "use_dora": false,
50
+ "use_qalora": false,
51
+ "use_rslora": false
52
+ }
checkpoint-1000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f94c7dd4d79ecdb435c295a616d4707c2bf0e734fbefe7d10ecfa59b195ee625
3
+ size 169741912
checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required, filter_keys=false) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- set add_comma = false -%}
6
+ {%- if not filter_keys or key not in standard_keys -%}
7
+ {%- if ns.found_first %},{% endif -%}
8
+ {%- set ns.found_first = true -%}
9
+ {{ key }}:{
10
+ {%- if value['description'] -%}
11
+ description:<|"|>{{ value['description'] }}<|"|>
12
+ {%- set add_comma = true -%}
13
+ {%- endif -%}
14
+ {%- if value['type'] | upper == 'STRING' -%}
15
+ {%- if value['enum'] -%}
16
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
17
+ enum:{{ format_argument(value['enum']) }}
18
+ {%- endif -%}
19
+ {%- elif value['type'] | upper == 'ARRAY' -%}
20
+ {%- if value['items'] is mapping and value['items'] -%}
21
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
22
+ items:{
23
+ {%- set ns_items = namespace(found_first=false) -%}
24
+ {%- for item_key, item_value in value['items'] | dictsort -%}
25
+ {%- if item_value is not none -%}
26
+ {%- if ns_items.found_first %},{% endif -%}
27
+ {%- set ns_items.found_first = true -%}
28
+ {%- if item_key == 'properties' -%}
29
+ properties:{
30
+ {%- if item_value is mapping -%}
31
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
32
+ {%- endif -%}
33
+ }
34
+ {%- elif item_key == 'required' -%}
35
+ required:[
36
+ {%- for req_item in item_value -%}
37
+ <|"|>{{- req_item -}}<|"|>
38
+ {%- if not loop.last %},{% endif -%}
39
+ {%- endfor -%}
40
+ ]
41
+ {%- elif item_key == 'type' -%}
42
+ {%- if item_value is string -%}
43
+ type:{{ format_argument(item_value | upper) }}
44
+ {%- else -%}
45
+ type:{{ format_argument(item_value | map('upper') | list) }}
46
+ {%- endif -%}
47
+ {%- else -%}
48
+ {{ item_key }}:{{ format_argument(item_value) }}
49
+ {%- endif -%}
50
+ {%- endif -%}
51
+ {%- endfor -%}
52
+ }
53
+ {%- endif -%}
54
+ {%- endif -%}
55
+ {%- if value['nullable'] %}
56
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
57
+ nullable:true
58
+ {%- endif -%}
59
+ {%- if value['type'] | upper == 'OBJECT' -%}
60
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
61
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
62
+ properties:{
63
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
64
+ }
65
+ {%- elif value is mapping -%}
66
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
+ properties:{
68
+ {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
69
+ }
70
+ {%- endif -%}
71
+ {%- if value['required'] -%}
72
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
73
+ required:[
74
+ {%- for item in value['required'] | default([]) -%}
75
+ <|"|>{{- item -}}<|"|>
76
+ {%- if not loop.last %},{% endif -%}
77
+ {%- endfor -%}
78
+ ]
79
+ {%- endif -%}
80
+ {%- endif -%}
81
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
82
+ type:<|"|>{{ value['type'] | upper }}<|"|>}
83
+ {%- endif -%}
84
+ {%- endfor -%}
85
+ {%- endmacro -%}
86
+ {%- macro format_function_declaration(tool_data) -%}
87
+ declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
88
+ {%- set params = tool_data['function']['parameters'] -%}
89
+ {%- if params -%}
90
+ ,parameters:{
91
+ {%- if params['properties'] -%}
92
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
93
+ {%- endif -%}
94
+ {%- if params['required'] -%}
95
+ required:[
96
+ {%- for item in params['required'] -%}
97
+ <|"|>{{- item -}}<|"|>
98
+ {{- ',' if not loop.last -}}
99
+ {%- endfor -%}
100
+ ],
101
+ {%- endif -%}
102
+ {%- if params['type'] -%}
103
+ type:<|"|>{{- params['type'] | upper -}}<|"|>}
104
+ {%- endif -%}
105
+ {%- endif -%}
106
+ {%- if 'response' in tool_data['function'] -%}
107
+ {%- set response_declaration = tool_data['function']['response'] -%}
108
+ ,response:{
109
+ {%- if response_declaration['description'] -%}
110
+ description:<|"|>{{- response_declaration['description'] -}}<|"|>,
111
+ {%- endif -%}
112
+ {%- if response_declaration['type'] | upper == 'OBJECT' -%}
113
+ type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
114
+ {%- endif -%}
115
+ {%- endif -%}
116
+ }
117
+ {%- endmacro -%}
118
+ {%- macro format_argument(argument, escape_keys=True) -%}
119
+ {%- if argument is string -%}
120
+ {{- '<|"|>' + argument + '<|"|>' -}}
121
+ {%- elif argument is boolean -%}
122
+ {{- 'true' if argument else 'false' -}}
123
+ {%- elif argument is mapping -%}
124
+ {{- '{' -}}
125
+ {%- set ns = namespace(found_first=false) -%}
126
+ {%- for key, value in argument | dictsort -%}
127
+ {%- if ns.found_first %},{% endif -%}
128
+ {%- set ns.found_first = true -%}
129
+ {%- if escape_keys -%}
130
+ {{- '<|"|>' + key + '<|"|>' -}}
131
+ {%- else -%}
132
+ {{- key -}}
133
+ {%- endif -%}
134
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
135
+ {%- endfor -%}
136
+ {{- '}' -}}
137
+ {%- elif argument is sequence -%}
138
+ {{- '[' -}}
139
+ {%- for item in argument -%}
140
+ {{- format_argument(item, escape_keys=escape_keys) -}}
141
+ {%- if not loop.last %},{% endif -%}
142
+ {%- endfor -%}
143
+ {{- ']' -}}
144
+ {%- else -%}
145
+ {{- argument -}}
146
+ {%- endif -%}
147
+ {%- endmacro -%}
148
+ {%- macro strip_thinking(text) -%}
149
+ {%- set ns = namespace(result='') -%}
150
+ {%- for part in text.split('<channel|>') -%}
151
+ {%- if '<|channel>' in part -%}
152
+ {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
153
+ {%- else -%}
154
+ {%- set ns.result = ns.result + part -%}
155
+ {%- endif -%}
156
+ {%- endfor -%}
157
+ {{- ns.result | trim -}}
158
+ {%- endmacro -%}
159
+
160
+ {%- macro format_tool_response_block(tool_name, response) -%}
161
+ {{- '<|tool_response>' -}}
162
+ {%- if response is mapping -%}
163
+ {{- 'response:' + tool_name + '{' -}}
164
+ {%- for key, value in response | dictsort -%}
165
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
166
+ {%- if not loop.last %},{% endif -%}
167
+ {%- endfor -%}
168
+ {{- '}' -}}
169
+ {%- else -%}
170
+ {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
171
+ {%- endif -%}
172
+ {{- '<tool_response|>' -}}
173
+ {%- endmacro -%}
174
+
175
+ {%- set ns = namespace(prev_message_type=None) -%}
176
+ {%- set loop_messages = messages -%}
177
+ {{- bos_token -}}
178
+ {#- Handle System/Tool Definitions Block -#}
179
+ {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
+ {{- '<|turn>system\n' -}}
181
+ {#- Inject Thinking token at the very top of the FIRST system turn -#}
182
+ {%- if enable_thinking is defined and enable_thinking -%}
183
+ {{- '<|think|>\n' -}}
184
+ {%- set ns.prev_message_type = 'think' -%}
185
+ {%- endif -%}
186
+ {%- if messages[0]['role'] in ['system', 'developer'] -%}
187
+ {%- if messages[0]['content'] is string -%}
188
+ {{- messages[0]['content'] | trim -}}
189
+ {%- elif messages[0]['content'] is sequence -%}
190
+ {%- for item in messages[0]['content'] -%}
191
+ {{- item['text'] | trim + ' '-}}
192
+ {%- endfor -%}
193
+ {%- endif -%}
194
+ {%- set loop_messages = messages[1:] -%}
195
+ {%- endif -%}
196
+ {%- if tools -%}
197
+ {%- for tool in tools %}
198
+ {{- '<|tool>' -}}
199
+ {{- format_function_declaration(tool) | trim -}}
200
+ {{- '<tool|>' -}}
201
+ {%- endfor %}
202
+ {%- set ns.prev_message_type = 'tool' -%}
203
+ {%- endif -%}
204
+ {{- '<turn|>\n' -}}
205
+ {%- endif %}
206
+
207
+ {#- Pre-scan: find last user message index for reasoning guard -#}
208
+ {%- set ns_turn = namespace(last_user_idx=-1) -%}
209
+ {%- for i in range(loop_messages | length) -%}
210
+ {%- if loop_messages[i]['role'] == 'user' -%}
211
+ {%- set ns_turn.last_user_idx = i -%}
212
+ {%- endif -%}
213
+ {%- endfor -%}
214
+
215
+ {#- Loop through messages -#}
216
+ {%- for message in loop_messages -%}
217
+ {%- if message['role'] != 'tool' -%}
218
+ {%- set ns.prev_message_type = None -%}
219
+ {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
220
+ {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
221
+ {%- set prev_nt = namespace(role=None, found=false) -%}
222
+ {%- if loop.index0 > 0 -%}
223
+ {%- for j in range(loop.index0 - 1, -1, -1) -%}
224
+ {%- if not prev_nt.found -%}
225
+ {%- if loop_messages[j]['role'] != 'tool' -%}
226
+ {%- set prev_nt.role = loop_messages[j]['role'] -%}
227
+ {%- set prev_nt.found = true -%}
228
+ {%- endif -%}
229
+ {%- endif -%}
230
+ {%- endfor -%}
231
+ {%- endif -%}
232
+ {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
233
+ {%- if not continue_same_model_turn -%}
234
+ {{- '<|turn>' + role + '\n' }}
235
+ {%- endif -%}
236
+
237
+ {#- Render reasoning/reasoning_content as thinking channel -#}
238
+ {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
239
+ {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
240
+ {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
241
+ {%- endif -%}
242
+
243
+ {%- if message['tool_calls'] -%}
244
+ {%- for tool_call in message['tool_calls'] -%}
245
+ {%- set function = tool_call['function'] -%}
246
+ {{- '<|tool_call>call:' + function['name'] + '{' -}}
247
+ {%- if function['arguments'] is mapping -%}
248
+ {%- set ns_args = namespace(found_first=false) -%}
249
+ {%- for key, value in function['arguments'] | dictsort -%}
250
+ {%- if ns_args.found_first %},{% endif -%}
251
+ {%- set ns_args.found_first = true -%}
252
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
253
+ {%- endfor -%}
254
+ {%- elif function['arguments'] is string -%}
255
+ {{- function['arguments'] -}}
256
+ {%- endif -%}
257
+ {{- '}<tool_call|>' -}}
258
+ {%- endfor -%}
259
+ {%- set ns.prev_message_type = 'tool_call' -%}
260
+ {%- endif -%}
261
+
262
+ {%- set ns_tr_out = namespace(flag=false) -%}
263
+ {%- if message.get('tool_responses') -%}
264
+ {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
265
+ {%- for tool_response in message['tool_responses'] -%}
266
+ {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
267
+ {%- set ns_tr_out.flag = true -%}
268
+ {%- set ns.prev_message_type = 'tool_response' -%}
269
+ {%- endfor -%}
270
+ {%- elif message.get('tool_calls') -%}
271
+ {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
272
+ {%- set ns_tool_scan = namespace(stopped=false) -%}
273
+ {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
274
+ {%- if ns_tool_scan.stopped -%}
275
+ {%- elif loop_messages[k]['role'] != 'tool' -%}
276
+ {%- set ns_tool_scan.stopped = true -%}
277
+ {%- else -%}
278
+ {%- set follow = loop_messages[k] -%}
279
+ {#- Resolve tool_call_id to function name -#}
280
+ {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
281
+ {%- for tc in message['tool_calls'] -%}
282
+ {%- if tc.get('id') == follow.get('tool_call_id') -%}
283
+ {%- set ns_tname.name = tc['function']['name'] -%}
284
+ {%- endif -%}
285
+ {%- endfor -%}
286
+ {#- Handle content as string or content-parts array -#}
287
+ {%- set tool_body = follow.get('content') -%}
288
+ {%- if tool_body is string -%}
289
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
290
+ {%- elif tool_body is sequence and tool_body is not string -%}
291
+ {%- set ns_txt = namespace(s='') -%}
292
+ {%- for part in tool_body -%}
293
+ {%- if part.get('type') == 'text' -%}
294
+ {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
295
+ {%- endif -%}
296
+ {%- endfor -%}
297
+ {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
298
+ {%- else -%}
299
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
300
+ {%- endif -%}
301
+ {%- set ns_tr_out.flag = true -%}
302
+ {%- set ns.prev_message_type = 'tool_response' -%}
303
+ {%- endif -%}
304
+ {%- endfor -%}
305
+ {%- endif -%}
306
+
307
+ {%- set captured_content -%}
308
+ {%- if message['content'] is string -%}
309
+ {%- if role == 'model' -%}
310
+ {{- strip_thinking(message['content']) -}}
311
+ {%- else -%}
312
+ {{- message['content'] | trim -}}
313
+ {%- endif -%}
314
+ {%- elif message['content'] is sequence -%}
315
+ {%- for item in message['content'] -%}
316
+ {%- if item['type'] == 'text' -%}
317
+ {%- if role == 'model' -%}
318
+ {{- strip_thinking(item['text']) -}}
319
+ {%- else -%}
320
+ {{- item['text'] | trim -}}
321
+ {%- endif -%}
322
+ {%- elif item['type'] == 'image' -%}
323
+ {{- '<|image|>' -}}
324
+ {%- set ns.prev_message_type = 'image' -%}
325
+ {%- elif item['type'] == 'audio' -%}
326
+ {{- '<|audio|>' -}}
327
+ {%- set ns.prev_message_type = 'audio' -%}
328
+ {%- elif item['type'] == 'video' -%}
329
+ {{- '<|video|>' -}}
330
+ {%- set ns.prev_message_type = 'video' -%}
331
+ {%- endif -%}
332
+ {%- endfor -%}
333
+ {%- endif -%}
334
+ {%- endset -%}
335
+
336
+ {{- captured_content -}}
337
+ {%- set has_content = captured_content | trim | length > 0 -%}
338
+
339
+ {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
340
+ {{- '<|tool_response>' -}}
341
+ {%- elif not (ns_tr_out.flag and not has_content) -%}
342
+ {{- '<turn|>\n' -}}
343
+ {%- endif -%}
344
+ {%- endif -%}
345
+ {%- endfor -%}
346
+
347
+ {%- if add_generation_prompt -%}
348
+ {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
349
+ {{- '<|turn>model\n' -}}
350
+ {%- endif -%}
351
+ {%- endif -%}
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:795a63e9a73654a7dd8a4dac66a5a2b305d11f32784400415681ec19ef91f007
3
+ size 72807355
checkpoint-1000/processor_config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_ms_per_token": 40,
3
+ "audio_seq_length": 750,
4
+ "feature_extractor": {
5
+ "dither": 0.0,
6
+ "feature_extractor_type": "Gemma4AudioFeatureExtractor",
7
+ "feature_size": 128,
8
+ "fft_length": 512,
9
+ "fft_overdrive": false,
10
+ "frame_length": 320,
11
+ "hop_length": 160,
12
+ "input_scale_factor": 1.0,
13
+ "max_frequency": 8000.0,
14
+ "mel_floor": 0.001,
15
+ "min_frequency": 0.0,
16
+ "padding_side": "left",
17
+ "padding_value": 0.0,
18
+ "per_bin_mean": null,
19
+ "per_bin_stddev": null,
20
+ "preemphasis": 0.0,
21
+ "preemphasis_htk_flavor": true,
22
+ "return_attention_mask": true,
23
+ "sampling_rate": 16000
24
+ },
25
+ "image_processor": {
26
+ "do_convert_rgb": true,
27
+ "do_normalize": false,
28
+ "do_rescale": true,
29
+ "do_resize": true,
30
+ "image_mean": [
31
+ 0.0,
32
+ 0.0,
33
+ 0.0
34
+ ],
35
+ "image_processor_type": "Gemma4ImageProcessor",
36
+ "image_seq_length": 280,
37
+ "image_std": [
38
+ 1.0,
39
+ 1.0,
40
+ 1.0
41
+ ],
42
+ "max_soft_tokens": 280,
43
+ "patch_size": 16,
44
+ "pooling_kernel_size": 3,
45
+ "resample": 3,
46
+ "rescale_factor": 0.00392156862745098
47
+ },
48
+ "image_seq_length": 280,
49
+ "processor_class": "Gemma4Processor",
50
+ "video_processor": {
51
+ "do_convert_rgb": true,
52
+ "do_normalize": true,
53
+ "do_rescale": true,
54
+ "do_resize": true,
55
+ "do_sample_frames": true,
56
+ "image_mean": [
57
+ 0.0,
58
+ 0.0,
59
+ 0.0
60
+ ],
61
+ "image_std": [
62
+ 1.0,
63
+ 1.0,
64
+ 1.0
65
+ ],
66
+ "max_soft_tokens": 70,
67
+ "num_frames": 32,
68
+ "patch_size": 16,
69
+ "pooling_kernel_size": 3,
70
+ "resample": 3,
71
+ "rescale_factor": 0.00392156862745098,
72
+ "return_metadata": false,
73
+ "video_processor_type": "Gemma4VideoProcessor"
74
+ }
75
+ }
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
+ size 14645
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:406994c2cf2acc1e48ce8857e7cbb9e95d4fab92a97bbe36f71721705be347d7
3
+ size 1465
checkpoint-1000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
3
+ size 32169626
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<turn|>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 131072,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "right",
45
+ "processor_class": "Gemma4Processor",
46
+ "response_schema": {
47
+ "properties": {
48
+ "content": {
49
+ "type": "string"
50
+ },
51
+ "role": {
52
+ "const": "assistant"
53
+ },
54
+ "thinking": {
55
+ "type": "string"
56
+ },
57
+ "tool_calls": {
58
+ "items": {
59
+ "properties": {
60
+ "function": {
61
+ "properties": {
62
+ "arguments": {
63
+ "additionalProperties": {},
64
+ "type": "object",
65
+ "x-parser": "gemma4-tool-call"
66
+ },
67
+ "name": {
68
+ "type": "string"
69
+ }
70
+ },
71
+ "type": "object",
72
+ "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
73
+ },
74
+ "type": {
75
+ "const": "function"
76
+ }
77
+ },
78
+ "type": "object"
79
+ },
80
+ "type": "array",
81
+ "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
82
+ }
83
+ },
84
+ "type": "object",
85
+ "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
86
+ },
87
+ "soc_token": "<|channel>",
88
+ "sot_token": "<|turn>",
89
+ "stc_token": "<|tool_call>",
90
+ "std_token": "<|tool>",
91
+ "str_token": "<|tool_response>",
92
+ "think_token": "<|think|>",
93
+ "tokenizer_class": "GemmaTokenizer",
94
+ "unk_token": "<unk>",
95
+ "added_tokens_decoder": {
96
+ "0": {
97
+ "content": "<pad>",
98
+ "single_word": false,
99
+ "lstrip": false,
100
+ "rstrip": false,
101
+ "normalized": false,
102
+ "special": true
103
+ },
104
+ "1": {
105
+ "content": "<eos>",
106
+ "single_word": false,
107
+ "lstrip": false,
108
+ "rstrip": false,
109
+ "normalized": false,
110
+ "special": true
111
+ },
112
+ "2": {
113
+ "content": "<bos>",
114
+ "single_word": false,
115
+ "lstrip": false,
116
+ "rstrip": false,
117
+ "normalized": false,
118
+ "special": true
119
+ },
120
+ "3": {
121
+ "content": "<unk>",
122
+ "single_word": false,
123
+ "lstrip": false,
124
+ "rstrip": false,
125
+ "normalized": false,
126
+ "special": true
127
+ },
128
+ "4": {
129
+ "content": "<mask>",
130
+ "single_word": false,
131
+ "lstrip": false,
132
+ "rstrip": false,
133
+ "normalized": false,
134
+ "special": true
135
+ },
136
+ "46": {
137
+ "content": "<|tool>",
138
+ "single_word": false,
139
+ "lstrip": false,
140
+ "rstrip": false,
141
+ "normalized": false,
142
+ "special": true
143
+ },
144
+ "47": {
145
+ "content": "<tool|>",
146
+ "single_word": false,
147
+ "lstrip": false,
148
+ "rstrip": false,
149
+ "normalized": false,
150
+ "special": true
151
+ },
152
+ "48": {
153
+ "content": "<|tool_call>",
154
+ "single_word": false,
155
+ "lstrip": false,
156
+ "rstrip": false,
157
+ "normalized": false,
158
+ "special": true
159
+ },
160
+ "49": {
161
+ "content": "<tool_call|>",
162
+ "single_word": false,
163
+ "lstrip": false,
164
+ "rstrip": false,
165
+ "normalized": false,
166
+ "special": true
167
+ },
168
+ "50": {
169
+ "content": "<|tool_response>",
170
+ "single_word": false,
171
+ "lstrip": false,
172
+ "rstrip": false,
173
+ "normalized": false,
174
+ "special": true
175
+ },
176
+ "51": {
177
+ "content": "<tool_response|>",
178
+ "single_word": false,
179
+ "lstrip": false,
180
+ "rstrip": false,
181
+ "normalized": false,
182
+ "special": true
183
+ },
184
+ "52": {
185
+ "content": "<|\"|>",
186
+ "single_word": false,
187
+ "lstrip": false,
188
+ "rstrip": false,
189
+ "normalized": false,
190
+ "special": true
191
+ },
192
+ "98": {
193
+ "content": "<|think|>",
194
+ "single_word": false,
195
+ "lstrip": false,
196
+ "rstrip": false,
197
+ "normalized": false,
198
+ "special": true
199
+ },
200
+ "100": {
201
+ "content": "<|channel>",
202
+ "single_word": false,
203
+ "lstrip": false,
204
+ "rstrip": false,
205
+ "normalized": false,
206
+ "special": true
207
+ },
208
+ "101": {
209
+ "content": "<channel|>",
210
+ "single_word": false,
211
+ "lstrip": false,
212
+ "rstrip": false,
213
+ "normalized": false,
214
+ "special": true
215
+ },
216
+ "105": {
217
+ "content": "<|turn>",
218
+ "single_word": false,
219
+ "lstrip": false,
220
+ "rstrip": false,
221
+ "normalized": false,
222
+ "special": true
223
+ },
224
+ "106": {
225
+ "content": "<turn|>",
226
+ "single_word": false,
227
+ "lstrip": false,
228
+ "rstrip": false,
229
+ "normalized": false,
230
+ "special": true
231
+ },
232
+ "255999": {
233
+ "content": "<|image>",
234
+ "single_word": false,
235
+ "lstrip": false,
236
+ "rstrip": false,
237
+ "normalized": false,
238
+ "special": true
239
+ },
240
+ "256000": {
241
+ "content": "<|audio>",
242
+ "single_word": false,
243
+ "lstrip": false,
244
+ "rstrip": false,
245
+ "normalized": false,
246
+ "special": true
247
+ },
248
+ "258880": {
249
+ "content": "<|image|>",
250
+ "single_word": false,
251
+ "lstrip": false,
252
+ "rstrip": false,
253
+ "normalized": false,
254
+ "special": true
255
+ },
256
+ "258881": {
257
+ "content": "<|audio|>",
258
+ "single_word": false,
259
+ "lstrip": false,
260
+ "rstrip": false,
261
+ "normalized": false,
262
+ "special": true
263
+ },
264
+ "258882": {
265
+ "content": "<image|>",
266
+ "single_word": false,
267
+ "lstrip": false,
268
+ "rstrip": false,
269
+ "normalized": false,
270
+ "special": true
271
+ },
272
+ "258883": {
273
+ "content": "<audio|>",
274
+ "single_word": false,
275
+ "lstrip": false,
276
+ "rstrip": false,
277
+ "normalized": false,
278
+ "special": true
279
+ },
280
+ "258884": {
281
+ "content": "<|video|>",
282
+ "single_word": false,
283
+ "lstrip": false,
284
+ "rstrip": false,
285
+ "normalized": false,
286
+ "special": true
287
+ }
288
+ }
289
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,1442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.1819505094614265,
6
+ "eval_steps": 100,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0009097525473071324,
14
+ "grad_norm": 1.0602493286132812,
15
+ "learning_rate": 1.2121212121212122e-06,
16
+ "loss": 1.7156932830810547,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.001819505094614265,
21
+ "grad_norm": 1.1577719449996948,
22
+ "learning_rate": 2.7272727272727272e-06,
23
+ "loss": 1.6629371643066406,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.0027292576419213972,
28
+ "grad_norm": 1.0288419723510742,
29
+ "learning_rate": 4.242424242424243e-06,
30
+ "loss": 1.6706295013427734,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.00363901018922853,
35
+ "grad_norm": 2.129403829574585,
36
+ "learning_rate": 5.7575757575757586e-06,
37
+ "loss": 1.7363752365112304,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.004548762736535662,
42
+ "grad_norm": 1.9468326568603516,
43
+ "learning_rate": 7.272727272727272e-06,
44
+ "loss": 1.7111135482788087,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.0054585152838427945,
49
+ "grad_norm": 1.1269357204437256,
50
+ "learning_rate": 8.787878787878788e-06,
51
+ "loss": 1.6924203872680663,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.006368267831149927,
56
+ "grad_norm": 1.4021248817443848,
57
+ "learning_rate": 1.0303030303030304e-05,
58
+ "loss": 1.658310317993164,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.00727802037845706,
63
+ "grad_norm": 1.313381314277649,
64
+ "learning_rate": 1.1818181818181819e-05,
65
+ "loss": 1.5383296012878418,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.008187772925764192,
70
+ "grad_norm": 2.4359891414642334,
71
+ "learning_rate": 1.3333333333333333e-05,
72
+ "loss": 1.4302565574645996,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.009097525473071324,
77
+ "grad_norm": 1.6459542512893677,
78
+ "learning_rate": 1.484848484848485e-05,
79
+ "loss": 1.2602953910827637,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.010007278020378457,
84
+ "grad_norm": 0.7953159213066101,
85
+ "learning_rate": 1.6363636363636366e-05,
86
+ "loss": 1.204326343536377,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.010917030567685589,
91
+ "grad_norm": 0.5824465155601501,
92
+ "learning_rate": 1.787878787878788e-05,
93
+ "loss": 1.068561840057373,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.011826783114992722,
98
+ "grad_norm": 0.39265626668930054,
99
+ "learning_rate": 1.9393939393939395e-05,
100
+ "loss": 0.9570062637329102,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.012736535662299854,
105
+ "grad_norm": 0.3387283384799957,
106
+ "learning_rate": 2.090909090909091e-05,
107
+ "loss": 0.9454713821411133,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.013646288209606987,
112
+ "grad_norm": 0.3182811141014099,
113
+ "learning_rate": 2.2424242424242424e-05,
114
+ "loss": 0.8901592254638672,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.01455604075691412,
119
+ "grad_norm": 0.2735312879085541,
120
+ "learning_rate": 2.393939393939394e-05,
121
+ "loss": 0.8491583824157715,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.015465793304221253,
126
+ "grad_norm": 0.2376435250043869,
127
+ "learning_rate": 2.5454545454545454e-05,
128
+ "loss": 0.8109179496765136,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.016375545851528384,
133
+ "grad_norm": 0.2161586880683899,
134
+ "learning_rate": 2.696969696969697e-05,
135
+ "loss": 0.76962308883667,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.017285298398835518,
140
+ "grad_norm": 0.19587980210781097,
141
+ "learning_rate": 2.8484848484848486e-05,
142
+ "loss": 0.7301986694335938,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.018195050946142648,
147
+ "grad_norm": 0.20971694588661194,
148
+ "learning_rate": 3e-05,
149
+ "loss": 0.7269618034362793,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.018195050946142648,
154
+ "eval_loss": 2.605874538421631,
155
+ "eval_runtime": 1120.0905,
156
+ "eval_samples_per_second": 33.935,
157
+ "eval_steps_per_second": 8.484,
158
+ "step": 100
159
+ },
160
+ {
161
+ "epoch": 0.01910480349344978,
162
+ "grad_norm": 0.10413152724504471,
163
+ "learning_rate": 3.151515151515151e-05,
164
+ "loss": 0.3250573635101318,
165
+ "step": 105
166
+ },
167
+ {
168
+ "epoch": 0.020014556040756915,
169
+ "grad_norm": 0.09383206814527512,
170
+ "learning_rate": 3.303030303030303e-05,
171
+ "loss": 0.3277724742889404,
172
+ "step": 110
173
+ },
174
+ {
175
+ "epoch": 0.020924308588064048,
176
+ "grad_norm": 0.1195850670337677,
177
+ "learning_rate": 3.454545454545455e-05,
178
+ "loss": 0.3215961217880249,
179
+ "step": 115
180
+ },
181
+ {
182
+ "epoch": 0.021834061135371178,
183
+ "grad_norm": 0.0715397521853447,
184
+ "learning_rate": 3.606060606060606e-05,
185
+ "loss": 0.3120795965194702,
186
+ "step": 120
187
+ },
188
+ {
189
+ "epoch": 0.02274381368267831,
190
+ "grad_norm": 0.068007692694664,
191
+ "learning_rate": 3.757575757575758e-05,
192
+ "loss": 0.2964257955551147,
193
+ "step": 125
194
+ },
195
+ {
196
+ "epoch": 0.023653566229985445,
197
+ "grad_norm": 0.09345484524965286,
198
+ "learning_rate": 3.909090909090909e-05,
199
+ "loss": 0.30776252746582033,
200
+ "step": 130
201
+ },
202
+ {
203
+ "epoch": 0.024563318777292575,
204
+ "grad_norm": 0.05577846243977547,
205
+ "learning_rate": 4.0606060606060606e-05,
206
+ "loss": 0.3180255889892578,
207
+ "step": 135
208
+ },
209
+ {
210
+ "epoch": 0.025473071324599708,
211
+ "grad_norm": 0.05919989198446274,
212
+ "learning_rate": 4.212121212121212e-05,
213
+ "loss": 0.31608285903930666,
214
+ "step": 140
215
+ },
216
+ {
217
+ "epoch": 0.02638282387190684,
218
+ "grad_norm": 0.05644674599170685,
219
+ "learning_rate": 4.3636363636363636e-05,
220
+ "loss": 0.2993780136108398,
221
+ "step": 145
222
+ },
223
+ {
224
+ "epoch": 0.027292576419213975,
225
+ "grad_norm": 0.059986088424921036,
226
+ "learning_rate": 4.515151515151516e-05,
227
+ "loss": 0.2931638479232788,
228
+ "step": 150
229
+ },
230
+ {
231
+ "epoch": 0.028202328966521105,
232
+ "grad_norm": 0.05941484495997429,
233
+ "learning_rate": 4.666666666666667e-05,
234
+ "loss": 0.29284651279449464,
235
+ "step": 155
236
+ },
237
+ {
238
+ "epoch": 0.02911208151382824,
239
+ "grad_norm": 0.0579044483602047,
240
+ "learning_rate": 4.8181818181818186e-05,
241
+ "loss": 0.2927037000656128,
242
+ "step": 160
243
+ },
244
+ {
245
+ "epoch": 0.030021834061135372,
246
+ "grad_norm": 0.061985693871974945,
247
+ "learning_rate": 4.9696969696969694e-05,
248
+ "loss": 0.28671720027923586,
249
+ "step": 165
250
+ },
251
+ {
252
+ "epoch": 0.030931586608442505,
253
+ "grad_norm": 0.05715535953640938,
254
+ "learning_rate": 4.999993064772809e-05,
255
+ "loss": 0.2817929744720459,
256
+ "step": 170
257
+ },
258
+ {
259
+ "epoch": 0.03184133915574964,
260
+ "grad_norm": 0.06549780815839767,
261
+ "learning_rate": 4.999964890478288e-05,
262
+ "loss": 0.27853829860687257,
263
+ "step": 175
264
+ },
265
+ {
266
+ "epoch": 0.03275109170305677,
267
+ "grad_norm": 0.05948757752776146,
268
+ "learning_rate": 4.999915043908795e-05,
269
+ "loss": 0.27522289752960205,
270
+ "step": 180
271
+ },
272
+ {
273
+ "epoch": 0.0336608442503639,
274
+ "grad_norm": 0.06262889504432678,
275
+ "learning_rate": 4.9998435254964515e-05,
276
+ "loss": 0.270997428894043,
277
+ "step": 185
278
+ },
279
+ {
280
+ "epoch": 0.034570596797671035,
281
+ "grad_norm": 0.06916829943656921,
282
+ "learning_rate": 4.999750335861253e-05,
283
+ "loss": 0.2788438558578491,
284
+ "step": 190
285
+ },
286
+ {
287
+ "epoch": 0.035480349344978165,
288
+ "grad_norm": 0.06128217652440071,
289
+ "learning_rate": 4.9996354758110624e-05,
290
+ "loss": 0.25649352073669435,
291
+ "step": 195
292
+ },
293
+ {
294
+ "epoch": 0.036390101892285295,
295
+ "grad_norm": 0.06704027950763702,
296
+ "learning_rate": 4.999498946341606e-05,
297
+ "loss": 0.25619523525238036,
298
+ "step": 200
299
+ },
300
+ {
301
+ "epoch": 0.03729985443959243,
302
+ "grad_norm": 0.061678580939769745,
303
+ "learning_rate": 4.999340748636462e-05,
304
+ "loss": 0.24956226348876953,
305
+ "step": 205
306
+ },
307
+ {
308
+ "epoch": 0.03820960698689956,
309
+ "grad_norm": 0.07328873127698898,
310
+ "learning_rate": 4.999160884067051e-05,
311
+ "loss": 0.26169676780700685,
312
+ "step": 210
313
+ },
314
+ {
315
+ "epoch": 0.0391193595342067,
316
+ "grad_norm": 0.08287990838289261,
317
+ "learning_rate": 4.9989593541926246e-05,
318
+ "loss": 0.2574604034423828,
319
+ "step": 215
320
+ },
321
+ {
322
+ "epoch": 0.04002911208151383,
323
+ "grad_norm": 0.06787359714508057,
324
+ "learning_rate": 4.9987361607602525e-05,
325
+ "loss": 0.25351409912109374,
326
+ "step": 220
327
+ },
328
+ {
329
+ "epoch": 0.04093886462882096,
330
+ "grad_norm": 0.06695502996444702,
331
+ "learning_rate": 4.998491305704805e-05,
332
+ "loss": 0.24522039890289307,
333
+ "step": 225
334
+ },
335
+ {
336
+ "epoch": 0.041848617176128096,
337
+ "grad_norm": 0.08872214704751968,
338
+ "learning_rate": 4.9982247911489375e-05,
339
+ "loss": 0.2581867933273315,
340
+ "step": 230
341
+ },
342
+ {
343
+ "epoch": 0.042758369723435226,
344
+ "grad_norm": 0.07637131959199905,
345
+ "learning_rate": 4.9979366194030743e-05,
346
+ "loss": 0.25569658279418944,
347
+ "step": 235
348
+ },
349
+ {
350
+ "epoch": 0.043668122270742356,
351
+ "grad_norm": 0.08158119022846222,
352
+ "learning_rate": 4.997626792965385e-05,
353
+ "loss": 0.2529409646987915,
354
+ "step": 240
355
+ },
356
+ {
357
+ "epoch": 0.04457787481804949,
358
+ "grad_norm": 0.07529161125421524,
359
+ "learning_rate": 4.997295314521766e-05,
360
+ "loss": 0.24049024581909179,
361
+ "step": 245
362
+ },
363
+ {
364
+ "epoch": 0.04548762736535662,
365
+ "grad_norm": 0.08860139548778534,
366
+ "learning_rate": 4.996942186945813e-05,
367
+ "loss": 0.2490522861480713,
368
+ "step": 250
369
+ },
370
+ {
371
+ "epoch": 0.04639737991266375,
372
+ "grad_norm": 0.0850321501493454,
373
+ "learning_rate": 4.9965674132988005e-05,
374
+ "loss": 0.24180831909179687,
375
+ "step": 255
376
+ },
377
+ {
378
+ "epoch": 0.04730713245997089,
379
+ "grad_norm": 0.07556115090847015,
380
+ "learning_rate": 4.996170996829653e-05,
381
+ "loss": 0.2509631872177124,
382
+ "step": 260
383
+ },
384
+ {
385
+ "epoch": 0.04821688500727802,
386
+ "grad_norm": 0.07971206307411194,
387
+ "learning_rate": 4.995752940974918e-05,
388
+ "loss": 0.24398891925811766,
389
+ "step": 265
390
+ },
391
+ {
392
+ "epoch": 0.04912663755458515,
393
+ "grad_norm": 0.09149336814880371,
394
+ "learning_rate": 4.9953132493587344e-05,
395
+ "loss": 0.2300492286682129,
396
+ "step": 270
397
+ },
398
+ {
399
+ "epoch": 0.050036390101892286,
400
+ "grad_norm": 0.08265820890665054,
401
+ "learning_rate": 4.9948519257928034e-05,
402
+ "loss": 0.24246792793273925,
403
+ "step": 275
404
+ },
405
+ {
406
+ "epoch": 0.050946142649199416,
407
+ "grad_norm": 0.10328587144613266,
408
+ "learning_rate": 4.9943689742763534e-05,
409
+ "loss": 0.2367171049118042,
410
+ "step": 280
411
+ },
412
+ {
413
+ "epoch": 0.05185589519650655,
414
+ "grad_norm": 0.0836917981505394,
415
+ "learning_rate": 4.993864398996105e-05,
416
+ "loss": 0.23215813636779786,
417
+ "step": 285
418
+ },
419
+ {
420
+ "epoch": 0.05276564774381368,
421
+ "grad_norm": 0.09475161135196686,
422
+ "learning_rate": 4.99333820432624e-05,
423
+ "loss": 0.2350748062133789,
424
+ "step": 290
425
+ },
426
+ {
427
+ "epoch": 0.05367540029112081,
428
+ "grad_norm": 0.08040128648281097,
429
+ "learning_rate": 4.992790394828355e-05,
430
+ "loss": 0.23253886699676513,
431
+ "step": 295
432
+ },
433
+ {
434
+ "epoch": 0.05458515283842795,
435
+ "grad_norm": 0.08852150291204453,
436
+ "learning_rate": 4.992220975251428e-05,
437
+ "loss": 0.23856515884399415,
438
+ "step": 300
439
+ },
440
+ {
441
+ "epoch": 0.05549490538573508,
442
+ "grad_norm": 0.09565229713916779,
443
+ "learning_rate": 4.991629950531775e-05,
444
+ "loss": 0.23311660289764405,
445
+ "step": 305
446
+ },
447
+ {
448
+ "epoch": 0.05640465793304221,
449
+ "grad_norm": 0.08158160001039505,
450
+ "learning_rate": 4.991017325793009e-05,
451
+ "loss": 0.22467944622039795,
452
+ "step": 310
453
+ },
454
+ {
455
+ "epoch": 0.05731441048034935,
456
+ "grad_norm": 0.07746429741382599,
457
+ "learning_rate": 4.990383106345994e-05,
458
+ "loss": 0.229844069480896,
459
+ "step": 315
460
+ },
461
+ {
462
+ "epoch": 0.05822416302765648,
463
+ "grad_norm": 0.08564355969429016,
464
+ "learning_rate": 4.989727297688797e-05,
465
+ "loss": 0.22414517402648926,
466
+ "step": 320
467
+ },
468
+ {
469
+ "epoch": 0.05913391557496361,
470
+ "grad_norm": 0.07517435401678085,
471
+ "learning_rate": 4.9890499055066435e-05,
472
+ "loss": 0.2236532211303711,
473
+ "step": 325
474
+ },
475
+ {
476
+ "epoch": 0.060043668122270744,
477
+ "grad_norm": 0.111734539270401,
478
+ "learning_rate": 4.988350935671869e-05,
479
+ "loss": 0.21474847793579102,
480
+ "step": 330
481
+ },
482
+ {
483
+ "epoch": 0.060953420669577874,
484
+ "grad_norm": 0.09906989336013794,
485
+ "learning_rate": 4.987630394243866e-05,
486
+ "loss": 0.23321933746337892,
487
+ "step": 335
488
+ },
489
+ {
490
+ "epoch": 0.06186317321688501,
491
+ "grad_norm": 0.10131457448005676,
492
+ "learning_rate": 4.98688828746903e-05,
493
+ "loss": 0.2310662031173706,
494
+ "step": 340
495
+ },
496
+ {
497
+ "epoch": 0.06277292576419213,
498
+ "grad_norm": 0.09203507006168365,
499
+ "learning_rate": 4.986124621780708e-05,
500
+ "loss": 0.22021169662475587,
501
+ "step": 345
502
+ },
503
+ {
504
+ "epoch": 0.06368267831149928,
505
+ "grad_norm": 0.09505912661552429,
506
+ "learning_rate": 4.9853394037991416e-05,
507
+ "loss": 0.2197155237197876,
508
+ "step": 350
509
+ },
510
+ {
511
+ "epoch": 0.06459243085880641,
512
+ "grad_norm": 0.09038657695055008,
513
+ "learning_rate": 4.984532640331412e-05,
514
+ "loss": 0.22066287994384765,
515
+ "step": 355
516
+ },
517
+ {
518
+ "epoch": 0.06550218340611354,
519
+ "grad_norm": 0.09707064181566238,
520
+ "learning_rate": 4.9837043383713753e-05,
521
+ "loss": 0.22455451488494874,
522
+ "step": 360
523
+ },
524
+ {
525
+ "epoch": 0.06641193595342067,
526
+ "grad_norm": 0.10367228090763092,
527
+ "learning_rate": 4.98285450509961e-05,
528
+ "loss": 0.21993820667266845,
529
+ "step": 365
530
+ },
531
+ {
532
+ "epoch": 0.0673216885007278,
533
+ "grad_norm": 0.12229471653699875,
534
+ "learning_rate": 4.9819831478833456e-05,
535
+ "loss": 0.2168867588043213,
536
+ "step": 370
537
+ },
538
+ {
539
+ "epoch": 0.06823144104803494,
540
+ "grad_norm": 0.0964592918753624,
541
+ "learning_rate": 4.981090274276406e-05,
542
+ "loss": 0.21579203605651856,
543
+ "step": 375
544
+ },
545
+ {
546
+ "epoch": 0.06914119359534207,
547
+ "grad_norm": 0.09400496631860733,
548
+ "learning_rate": 4.980175892019141e-05,
549
+ "loss": 0.20972180366516113,
550
+ "step": 380
551
+ },
552
+ {
553
+ "epoch": 0.0700509461426492,
554
+ "grad_norm": 0.08158645778894424,
555
+ "learning_rate": 4.9792400090383594e-05,
556
+ "loss": 0.22148358821868896,
557
+ "step": 385
558
+ },
559
+ {
560
+ "epoch": 0.07096069868995633,
561
+ "grad_norm": 0.10916394740343094,
562
+ "learning_rate": 4.978282633447261e-05,
563
+ "loss": 0.2214418649673462,
564
+ "step": 390
565
+ },
566
+ {
567
+ "epoch": 0.07187045123726346,
568
+ "grad_norm": 0.11138810962438583,
569
+ "learning_rate": 4.9773037735453636e-05,
570
+ "loss": 0.21814754009246826,
571
+ "step": 395
572
+ },
573
+ {
574
+ "epoch": 0.07278020378457059,
575
+ "grad_norm": 0.10914396494626999,
576
+ "learning_rate": 4.9763034378184365e-05,
577
+ "loss": 0.21310818195343018,
578
+ "step": 400
579
+ },
580
+ {
581
+ "epoch": 0.07368995633187773,
582
+ "grad_norm": 0.1043366864323616,
583
+ "learning_rate": 4.975281634938421e-05,
584
+ "loss": 0.21266789436340333,
585
+ "step": 405
586
+ },
587
+ {
588
+ "epoch": 0.07459970887918486,
589
+ "grad_norm": 0.1036868542432785,
590
+ "learning_rate": 4.9742383737633594e-05,
591
+ "loss": 0.21606721878051757,
592
+ "step": 410
593
+ },
594
+ {
595
+ "epoch": 0.075509461426492,
596
+ "grad_norm": 0.11640442907810211,
597
+ "learning_rate": 4.9731736633373144e-05,
598
+ "loss": 0.21532948017120362,
599
+ "step": 415
600
+ },
601
+ {
602
+ "epoch": 0.07641921397379912,
603
+ "grad_norm": 0.11219926178455353,
604
+ "learning_rate": 4.9720875128902956e-05,
605
+ "loss": 0.2191627025604248,
606
+ "step": 420
607
+ },
608
+ {
609
+ "epoch": 0.07732896652110625,
610
+ "grad_norm": 0.12103637307882309,
611
+ "learning_rate": 4.970979931838176e-05,
612
+ "loss": 0.20938868522644044,
613
+ "step": 425
614
+ },
615
+ {
616
+ "epoch": 0.0782387190684134,
617
+ "grad_norm": 0.13274189829826355,
618
+ "learning_rate": 4.96985092978261e-05,
619
+ "loss": 0.21792960166931152,
620
+ "step": 430
621
+ },
622
+ {
623
+ "epoch": 0.07914847161572053,
624
+ "grad_norm": 0.11164513230323792,
625
+ "learning_rate": 4.968700516510954e-05,
626
+ "loss": 0.2022618055343628,
627
+ "step": 435
628
+ },
629
+ {
630
+ "epoch": 0.08005822416302766,
631
+ "grad_norm": 0.09532847255468369,
632
+ "learning_rate": 4.967528701996174e-05,
633
+ "loss": 0.21255812644958497,
634
+ "step": 440
635
+ },
636
+ {
637
+ "epoch": 0.08096797671033479,
638
+ "grad_norm": 0.10279258340597153,
639
+ "learning_rate": 4.96633549639677e-05,
640
+ "loss": 0.20683050155639648,
641
+ "step": 445
642
+ },
643
+ {
644
+ "epoch": 0.08187772925764192,
645
+ "grad_norm": 0.1257462352514267,
646
+ "learning_rate": 4.965120910056677e-05,
647
+ "loss": 0.21419920921325683,
648
+ "step": 450
649
+ },
650
+ {
651
+ "epoch": 0.08278748180494905,
652
+ "grad_norm": 0.11663137376308441,
653
+ "learning_rate": 4.963884953505186e-05,
654
+ "loss": 0.2072287082672119,
655
+ "step": 455
656
+ },
657
+ {
658
+ "epoch": 0.08369723435225619,
659
+ "grad_norm": 0.10488224029541016,
660
+ "learning_rate": 4.96262763745684e-05,
661
+ "loss": 0.1982678532600403,
662
+ "step": 460
663
+ },
664
+ {
665
+ "epoch": 0.08460698689956332,
666
+ "grad_norm": 0.11801692098379135,
667
+ "learning_rate": 4.961348972811354e-05,
668
+ "loss": 0.20662031173706055,
669
+ "step": 465
670
+ },
671
+ {
672
+ "epoch": 0.08551673944687045,
673
+ "grad_norm": 0.11318827420473099,
674
+ "learning_rate": 4.96004897065351e-05,
675
+ "loss": 0.20947303771972656,
676
+ "step": 470
677
+ },
678
+ {
679
+ "epoch": 0.08642649199417758,
680
+ "grad_norm": 0.13409486413002014,
681
+ "learning_rate": 4.95872764225307e-05,
682
+ "loss": 0.19670876264572143,
683
+ "step": 475
684
+ },
685
+ {
686
+ "epoch": 0.08733624454148471,
687
+ "grad_norm": 0.14440792798995972,
688
+ "learning_rate": 4.957384999064672e-05,
689
+ "loss": 0.19842848777770997,
690
+ "step": 480
691
+ },
692
+ {
693
+ "epoch": 0.08824599708879186,
694
+ "grad_norm": 0.12246996909379959,
695
+ "learning_rate": 4.956021052727731e-05,
696
+ "loss": 0.20318071842193602,
697
+ "step": 485
698
+ },
699
+ {
700
+ "epoch": 0.08915574963609899,
701
+ "grad_norm": 0.13437233865261078,
702
+ "learning_rate": 4.954635815066342e-05,
703
+ "loss": 0.21675212383270265,
704
+ "step": 490
705
+ },
706
+ {
707
+ "epoch": 0.09006550218340612,
708
+ "grad_norm": 0.11109672486782074,
709
+ "learning_rate": 4.9532292980891744e-05,
710
+ "loss": 0.2100757837295532,
711
+ "step": 495
712
+ },
713
+ {
714
+ "epoch": 0.09097525473071325,
715
+ "grad_norm": 0.1388893872499466,
716
+ "learning_rate": 4.9518015139893675e-05,
717
+ "loss": 0.20303285121917725,
718
+ "step": 500
719
+ },
720
+ {
721
+ "epoch": 0.09188500727802038,
722
+ "grad_norm": 0.13239721953868866,
723
+ "learning_rate": 4.950352475144427e-05,
724
+ "loss": 0.2152268409729004,
725
+ "step": 505
726
+ },
727
+ {
728
+ "epoch": 0.0927947598253275,
729
+ "grad_norm": 0.12834979593753815,
730
+ "learning_rate": 4.948882194116119e-05,
731
+ "loss": 0.20799248218536376,
732
+ "step": 510
733
+ },
734
+ {
735
+ "epoch": 0.09370451237263465,
736
+ "grad_norm": 0.11886704713106155,
737
+ "learning_rate": 4.947390683650354e-05,
738
+ "loss": 0.20394976139068605,
739
+ "step": 515
740
+ },
741
+ {
742
+ "epoch": 0.09461426491994178,
743
+ "grad_norm": 0.11398876458406448,
744
+ "learning_rate": 4.945877956677083e-05,
745
+ "loss": 0.2091092586517334,
746
+ "step": 520
747
+ },
748
+ {
749
+ "epoch": 0.09552401746724891,
750
+ "grad_norm": 0.1422540694475174,
751
+ "learning_rate": 4.944344026310186e-05,
752
+ "loss": 0.19564238786697388,
753
+ "step": 525
754
+ },
755
+ {
756
+ "epoch": 0.09643377001455604,
757
+ "grad_norm": 0.11359584331512451,
758
+ "learning_rate": 4.9427889058473535e-05,
759
+ "loss": 0.20493624210357667,
760
+ "step": 530
761
+ },
762
+ {
763
+ "epoch": 0.09734352256186317,
764
+ "grad_norm": 0.11703553050756454,
765
+ "learning_rate": 4.941212608769974e-05,
766
+ "loss": 0.2098615884780884,
767
+ "step": 535
768
+ },
769
+ {
770
+ "epoch": 0.0982532751091703,
771
+ "grad_norm": 0.14552047848701477,
772
+ "learning_rate": 4.939615148743017e-05,
773
+ "loss": 0.20382182598114013,
774
+ "step": 540
775
+ },
776
+ {
777
+ "epoch": 0.09916302765647744,
778
+ "grad_norm": 0.13178016245365143,
779
+ "learning_rate": 4.937996539614914e-05,
780
+ "loss": 0.19901862144470214,
781
+ "step": 545
782
+ },
783
+ {
784
+ "epoch": 0.10007278020378457,
785
+ "grad_norm": 0.635392427444458,
786
+ "learning_rate": 4.936356795417439e-05,
787
+ "loss": 0.20694944858551026,
788
+ "step": 550
789
+ },
790
+ {
791
+ "epoch": 0.1009825327510917,
792
+ "grad_norm": 0.15019077062606812,
793
+ "learning_rate": 4.934695930365586e-05,
794
+ "loss": 0.19313746690750122,
795
+ "step": 555
796
+ },
797
+ {
798
+ "epoch": 0.10189228529839883,
799
+ "grad_norm": 0.12941956520080566,
800
+ "learning_rate": 4.9330139588574474e-05,
801
+ "loss": 0.19671722650527954,
802
+ "step": 560
803
+ },
804
+ {
805
+ "epoch": 0.10280203784570596,
806
+ "grad_norm": 0.13818831741809845,
807
+ "learning_rate": 4.931310895474088e-05,
808
+ "loss": 0.20026786327362062,
809
+ "step": 565
810
+ },
811
+ {
812
+ "epoch": 0.1037117903930131,
813
+ "grad_norm": 0.12011194974184036,
814
+ "learning_rate": 4.929586754979417e-05,
815
+ "loss": 0.1932437539100647,
816
+ "step": 570
817
+ },
818
+ {
819
+ "epoch": 0.10462154294032024,
820
+ "grad_norm": 0.1345364898443222,
821
+ "learning_rate": 4.9278415523200644e-05,
822
+ "loss": 0.20245940685272218,
823
+ "step": 575
824
+ },
825
+ {
826
+ "epoch": 0.10553129548762737,
827
+ "grad_norm": 0.13281017541885376,
828
+ "learning_rate": 4.926075302625247e-05,
829
+ "loss": 0.19864981174468993,
830
+ "step": 580
831
+ },
832
+ {
833
+ "epoch": 0.1064410480349345,
834
+ "grad_norm": 0.13465586304664612,
835
+ "learning_rate": 4.924288021206639e-05,
836
+ "loss": 0.19573183059692384,
837
+ "step": 585
838
+ },
839
+ {
840
+ "epoch": 0.10735080058224163,
841
+ "grad_norm": 0.15225961804389954,
842
+ "learning_rate": 4.9224797235582396e-05,
843
+ "loss": 0.19946801662445068,
844
+ "step": 590
845
+ },
846
+ {
847
+ "epoch": 0.10826055312954876,
848
+ "grad_norm": 0.12816746532917023,
849
+ "learning_rate": 4.92065042535624e-05,
850
+ "loss": 0.19851526021957397,
851
+ "step": 595
852
+ },
853
+ {
854
+ "epoch": 0.1091703056768559,
855
+ "grad_norm": 0.13802853226661682,
856
+ "learning_rate": 4.9188001424588824e-05,
857
+ "loss": 0.19321763515472412,
858
+ "step": 600
859
+ },
860
+ {
861
+ "epoch": 0.11008005822416303,
862
+ "grad_norm": 0.17504797875881195,
863
+ "learning_rate": 4.9169288909063295e-05,
864
+ "loss": 0.2032616138458252,
865
+ "step": 605
866
+ },
867
+ {
868
+ "epoch": 0.11098981077147016,
869
+ "grad_norm": 0.13544194400310516,
870
+ "learning_rate": 4.91503668692052e-05,
871
+ "loss": 0.2011256456375122,
872
+ "step": 610
873
+ },
874
+ {
875
+ "epoch": 0.11189956331877729,
876
+ "grad_norm": 1.3976134061813354,
877
+ "learning_rate": 4.91312354690503e-05,
878
+ "loss": 0.19916868209838867,
879
+ "step": 615
880
+ },
881
+ {
882
+ "epoch": 0.11280931586608442,
883
+ "grad_norm": 0.1465059071779251,
884
+ "learning_rate": 4.91118948744493e-05,
885
+ "loss": 0.19487457275390624,
886
+ "step": 620
887
+ },
888
+ {
889
+ "epoch": 0.11371906841339156,
890
+ "grad_norm": 0.12103168666362762,
891
+ "learning_rate": 4.909234525306645e-05,
892
+ "loss": 0.1907251238822937,
893
+ "step": 625
894
+ },
895
+ {
896
+ "epoch": 0.1146288209606987,
897
+ "grad_norm": 0.12660574913024902,
898
+ "learning_rate": 4.907258677437802e-05,
899
+ "loss": 0.19327253103256226,
900
+ "step": 630
901
+ },
902
+ {
903
+ "epoch": 0.11553857350800582,
904
+ "grad_norm": 0.1347813606262207,
905
+ "learning_rate": 4.90526196096709e-05,
906
+ "loss": 0.19637736082077026,
907
+ "step": 635
908
+ },
909
+ {
910
+ "epoch": 0.11644832605531295,
911
+ "grad_norm": 0.14953652024269104,
912
+ "learning_rate": 4.903244393204107e-05,
913
+ "loss": 0.20325069427490233,
914
+ "step": 640
915
+ },
916
+ {
917
+ "epoch": 0.11735807860262008,
918
+ "grad_norm": 0.13936272263526917,
919
+ "learning_rate": 4.901205991639213e-05,
920
+ "loss": 0.1930275321006775,
921
+ "step": 645
922
+ },
923
+ {
924
+ "epoch": 0.11826783114992721,
925
+ "grad_norm": 0.1448420137166977,
926
+ "learning_rate": 4.899146773943374e-05,
927
+ "loss": 0.20026936531066894,
928
+ "step": 650
929
+ },
930
+ {
931
+ "epoch": 0.11917758369723436,
932
+ "grad_norm": 0.1312534064054489,
933
+ "learning_rate": 4.897066757968014e-05,
934
+ "loss": 0.19062033891677857,
935
+ "step": 655
936
+ },
937
+ {
938
+ "epoch": 0.12008733624454149,
939
+ "grad_norm": 0.13644742965698242,
940
+ "learning_rate": 4.894965961744859e-05,
941
+ "loss": 0.18719595670700073,
942
+ "step": 660
943
+ },
944
+ {
945
+ "epoch": 0.12099708879184862,
946
+ "grad_norm": 0.14276087284088135,
947
+ "learning_rate": 4.892844403485777e-05,
948
+ "loss": 0.19784307479858398,
949
+ "step": 665
950
+ },
951
+ {
952
+ "epoch": 0.12190684133915575,
953
+ "grad_norm": 0.14735399186611176,
954
+ "learning_rate": 4.890702101582623e-05,
955
+ "loss": 0.19163782596588136,
956
+ "step": 670
957
+ },
958
+ {
959
+ "epoch": 0.12281659388646288,
960
+ "grad_norm": 0.15742065012454987,
961
+ "learning_rate": 4.888539074607082e-05,
962
+ "loss": 0.19312986135482788,
963
+ "step": 675
964
+ },
965
+ {
966
+ "epoch": 0.12372634643377002,
967
+ "grad_norm": 0.12917031347751617,
968
+ "learning_rate": 4.8863553413105025e-05,
969
+ "loss": 0.20066320896148682,
970
+ "step": 680
971
+ },
972
+ {
973
+ "epoch": 0.12463609898107715,
974
+ "grad_norm": 0.1484801322221756,
975
+ "learning_rate": 4.884150920623737e-05,
976
+ "loss": 0.20096096992492676,
977
+ "step": 685
978
+ },
979
+ {
980
+ "epoch": 0.12554585152838427,
981
+ "grad_norm": 0.1455296128988266,
982
+ "learning_rate": 4.88192583165698e-05,
983
+ "loss": 0.20518505573272705,
984
+ "step": 690
985
+ },
986
+ {
987
+ "epoch": 0.12645560407569142,
988
+ "grad_norm": 0.14517490565776825,
989
+ "learning_rate": 4.879680093699598e-05,
990
+ "loss": 0.18859238624572755,
991
+ "step": 695
992
+ },
993
+ {
994
+ "epoch": 0.12736535662299855,
995
+ "grad_norm": 0.18778090178966522,
996
+ "learning_rate": 4.877413726219964e-05,
997
+ "loss": 0.197074818611145,
998
+ "step": 700
999
+ },
1000
+ {
1001
+ "epoch": 0.12827510917030568,
1002
+ "grad_norm": 0.13497677445411682,
1003
+ "learning_rate": 4.87512674886529e-05,
1004
+ "loss": 0.18713107109069824,
1005
+ "step": 705
1006
+ },
1007
+ {
1008
+ "epoch": 0.12918486171761281,
1009
+ "grad_norm": 0.12657155096530914,
1010
+ "learning_rate": 4.872819181461455e-05,
1011
+ "loss": 0.1858484387397766,
1012
+ "step": 710
1013
+ },
1014
+ {
1015
+ "epoch": 0.13009461426491994,
1016
+ "grad_norm": 0.11458148807287216,
1017
+ "learning_rate": 4.870491044012834e-05,
1018
+ "loss": 0.18732179403305055,
1019
+ "step": 715
1020
+ },
1021
+ {
1022
+ "epoch": 0.13100436681222707,
1023
+ "grad_norm": 0.13000249862670898,
1024
+ "learning_rate": 4.8681423567021244e-05,
1025
+ "loss": 0.1872936010360718,
1026
+ "step": 720
1027
+ },
1028
+ {
1029
+ "epoch": 0.1319141193595342,
1030
+ "grad_norm": 0.14580890536308289,
1031
+ "learning_rate": 4.865773139890172e-05,
1032
+ "loss": 0.19280019998550416,
1033
+ "step": 725
1034
+ },
1035
+ {
1036
+ "epoch": 0.13282387190684133,
1037
+ "grad_norm": 0.1507277935743332,
1038
+ "learning_rate": 4.8633834141157913e-05,
1039
+ "loss": 0.1898929238319397,
1040
+ "step": 730
1041
+ },
1042
+ {
1043
+ "epoch": 0.13373362445414846,
1044
+ "grad_norm": 0.1418737769126892,
1045
+ "learning_rate": 4.860973200095592e-05,
1046
+ "loss": 0.17926375865936278,
1047
+ "step": 735
1048
+ },
1049
+ {
1050
+ "epoch": 0.1346433770014556,
1051
+ "grad_norm": 0.17151866853237152,
1052
+ "learning_rate": 4.858542518723794e-05,
1053
+ "loss": 0.18963592052459716,
1054
+ "step": 740
1055
+ },
1056
+ {
1057
+ "epoch": 0.13555312954876272,
1058
+ "grad_norm": 0.11162743717432022,
1059
+ "learning_rate": 4.8560913910720535e-05,
1060
+ "loss": 0.19466646909713745,
1061
+ "step": 745
1062
+ },
1063
+ {
1064
+ "epoch": 0.13646288209606988,
1065
+ "grad_norm": 0.15628376603126526,
1066
+ "learning_rate": 4.8536198383892725e-05,
1067
+ "loss": 0.19494034051895143,
1068
+ "step": 750
1069
+ },
1070
+ {
1071
+ "epoch": 0.137372634643377,
1072
+ "grad_norm": 0.18209289014339447,
1073
+ "learning_rate": 4.851127882101421e-05,
1074
+ "loss": 0.18747550249099731,
1075
+ "step": 755
1076
+ },
1077
+ {
1078
+ "epoch": 0.13828238719068414,
1079
+ "grad_norm": 0.14559614658355713,
1080
+ "learning_rate": 4.8486155438113454e-05,
1081
+ "loss": 0.1897158980369568,
1082
+ "step": 760
1083
+ },
1084
+ {
1085
+ "epoch": 0.13919213973799127,
1086
+ "grad_norm": 0.3198587894439697,
1087
+ "learning_rate": 4.846082845298586e-05,
1088
+ "loss": 0.18571001291275024,
1089
+ "step": 765
1090
+ },
1091
+ {
1092
+ "epoch": 0.1401018922852984,
1093
+ "grad_norm": 0.1486678421497345,
1094
+ "learning_rate": 4.843529808519189e-05,
1095
+ "loss": 0.19561930894851684,
1096
+ "step": 770
1097
+ },
1098
+ {
1099
+ "epoch": 0.14101164483260553,
1100
+ "grad_norm": 0.15318170189857483,
1101
+ "learning_rate": 4.840956455605509e-05,
1102
+ "loss": 0.187040114402771,
1103
+ "step": 775
1104
+ },
1105
+ {
1106
+ "epoch": 0.14192139737991266,
1107
+ "grad_norm": 0.13754244148731232,
1108
+ "learning_rate": 4.838362808866025e-05,
1109
+ "loss": 0.18345539569854735,
1110
+ "step": 780
1111
+ },
1112
+ {
1113
+ "epoch": 0.1428311499272198,
1114
+ "grad_norm": 0.12943248450756073,
1115
+ "learning_rate": 4.835748890785143e-05,
1116
+ "loss": 0.1921079397201538,
1117
+ "step": 785
1118
+ },
1119
+ {
1120
+ "epoch": 0.14374090247452692,
1121
+ "grad_norm": 0.110458143055439,
1122
+ "learning_rate": 4.833114724023001e-05,
1123
+ "loss": 0.17927205562591553,
1124
+ "step": 790
1125
+ },
1126
+ {
1127
+ "epoch": 0.14465065502183405,
1128
+ "grad_norm": 0.2421770840883255,
1129
+ "learning_rate": 4.830460331415275e-05,
1130
+ "loss": 0.18317567110061644,
1131
+ "step": 795
1132
+ },
1133
+ {
1134
+ "epoch": 0.14556040756914118,
1135
+ "grad_norm": 0.14752762019634247,
1136
+ "learning_rate": 4.8277857359729787e-05,
1137
+ "loss": 0.1843916058540344,
1138
+ "step": 800
1139
+ },
1140
+ {
1141
+ "epoch": 0.14647016011644834,
1142
+ "grad_norm": 0.15043556690216064,
1143
+ "learning_rate": 4.8250909608822644e-05,
1144
+ "loss": 0.18354393243789674,
1145
+ "step": 805
1146
+ },
1147
+ {
1148
+ "epoch": 0.14737991266375547,
1149
+ "grad_norm": 0.1381794661283493,
1150
+ "learning_rate": 4.822376029504223e-05,
1151
+ "loss": 0.1789781332015991,
1152
+ "step": 810
1153
+ },
1154
+ {
1155
+ "epoch": 0.1482896652110626,
1156
+ "grad_norm": 0.18386174738407135,
1157
+ "learning_rate": 4.819640965374681e-05,
1158
+ "loss": 0.19494292736053467,
1159
+ "step": 815
1160
+ },
1161
+ {
1162
+ "epoch": 0.14919941775836973,
1163
+ "grad_norm": 0.13829593360424042,
1164
+ "learning_rate": 4.816885792203996e-05,
1165
+ "loss": 0.18486063480377196,
1166
+ "step": 820
1167
+ },
1168
+ {
1169
+ "epoch": 0.15010917030567686,
1170
+ "grad_norm": 0.15033291280269623,
1171
+ "learning_rate": 4.814110533876852e-05,
1172
+ "loss": 0.18061509132385253,
1173
+ "step": 825
1174
+ },
1175
+ {
1176
+ "epoch": 0.151018922852984,
1177
+ "grad_norm": 0.17150473594665527,
1178
+ "learning_rate": 4.811315214452051e-05,
1179
+ "loss": 0.18464866876602173,
1180
+ "step": 830
1181
+ },
1182
+ {
1183
+ "epoch": 0.15192867540029112,
1184
+ "grad_norm": 0.15317125618457794,
1185
+ "learning_rate": 4.808499858162307e-05,
1186
+ "loss": 0.1837708592414856,
1187
+ "step": 835
1188
+ },
1189
+ {
1190
+ "epoch": 0.15283842794759825,
1191
+ "grad_norm": 0.2671392560005188,
1192
+ "learning_rate": 4.805664489414031e-05,
1193
+ "loss": 0.19338636398315429,
1194
+ "step": 840
1195
+ },
1196
+ {
1197
+ "epoch": 0.15374818049490538,
1198
+ "grad_norm": 0.14047028124332428,
1199
+ "learning_rate": 4.802809132787125e-05,
1200
+ "loss": 0.17069108486175538,
1201
+ "step": 845
1202
+ },
1203
+ {
1204
+ "epoch": 0.1546579330422125,
1205
+ "grad_norm": 0.1520431935787201,
1206
+ "learning_rate": 4.799933813034768e-05,
1207
+ "loss": 0.18607735633850098,
1208
+ "step": 850
1209
+ },
1210
+ {
1211
+ "epoch": 0.15556768558951964,
1212
+ "grad_norm": 0.17239463329315186,
1213
+ "learning_rate": 4.797038555083197e-05,
1214
+ "loss": 0.18069062232971192,
1215
+ "step": 855
1216
+ },
1217
+ {
1218
+ "epoch": 0.1564774381368268,
1219
+ "grad_norm": 0.1377955675125122,
1220
+ "learning_rate": 4.794123384031495e-05,
1221
+ "loss": 0.18870222568511963,
1222
+ "step": 860
1223
+ },
1224
+ {
1225
+ "epoch": 0.15738719068413393,
1226
+ "grad_norm": 0.15901461243629456,
1227
+ "learning_rate": 4.791188325151373e-05,
1228
+ "loss": 0.18128334283828734,
1229
+ "step": 865
1230
+ },
1231
+ {
1232
+ "epoch": 0.15829694323144106,
1233
+ "grad_norm": 0.14634132385253906,
1234
+ "learning_rate": 4.7882334038869495e-05,
1235
+ "loss": 0.1866163969039917,
1236
+ "step": 870
1237
+ },
1238
+ {
1239
+ "epoch": 0.1592066957787482,
1240
+ "grad_norm": 0.15361061692237854,
1241
+ "learning_rate": 4.785258645854529e-05,
1242
+ "loss": 0.17850807905197144,
1243
+ "step": 875
1244
+ },
1245
+ {
1246
+ "epoch": 0.16011644832605532,
1247
+ "grad_norm": 0.13751649856567383,
1248
+ "learning_rate": 4.782264076842385e-05,
1249
+ "loss": 0.17731113433837892,
1250
+ "step": 880
1251
+ },
1252
+ {
1253
+ "epoch": 0.16102620087336245,
1254
+ "grad_norm": 0.17909638583660126,
1255
+ "learning_rate": 4.7792497228105314e-05,
1256
+ "loss": 0.18344542980194092,
1257
+ "step": 885
1258
+ },
1259
+ {
1260
+ "epoch": 0.16193595342066958,
1261
+ "grad_norm": 0.16038304567337036,
1262
+ "learning_rate": 4.776215609890498e-05,
1263
+ "loss": 0.18868647813796996,
1264
+ "step": 890
1265
+ },
1266
+ {
1267
+ "epoch": 0.1628457059679767,
1268
+ "grad_norm": 0.1653951108455658,
1269
+ "learning_rate": 4.773161764385107e-05,
1270
+ "loss": 0.18614152669906617,
1271
+ "step": 895
1272
+ },
1273
+ {
1274
+ "epoch": 0.16375545851528384,
1275
+ "grad_norm": 0.16193026304244995,
1276
+ "learning_rate": 4.770088212768241e-05,
1277
+ "loss": 0.18564575910568237,
1278
+ "step": 900
1279
+ },
1280
+ {
1281
+ "epoch": 0.16466521106259097,
1282
+ "grad_norm": 0.16048531234264374,
1283
+ "learning_rate": 4.7669949816846173e-05,
1284
+ "loss": 0.18330031633377075,
1285
+ "step": 905
1286
+ },
1287
+ {
1288
+ "epoch": 0.1655749636098981,
1289
+ "grad_norm": 0.1440177708864212,
1290
+ "learning_rate": 4.7638820979495534e-05,
1291
+ "loss": 0.17712442874908446,
1292
+ "step": 910
1293
+ },
1294
+ {
1295
+ "epoch": 0.16648471615720525,
1296
+ "grad_norm": 0.19635969400405884,
1297
+ "learning_rate": 4.760749588548738e-05,
1298
+ "loss": 0.18679027557373046,
1299
+ "step": 915
1300
+ },
1301
+ {
1302
+ "epoch": 0.16739446870451238,
1303
+ "grad_norm": 0.15576541423797607,
1304
+ "learning_rate": 4.757597480637995e-05,
1305
+ "loss": 0.19283764362335204,
1306
+ "step": 920
1307
+ },
1308
+ {
1309
+ "epoch": 0.1683042212518195,
1310
+ "grad_norm": 0.1550331562757492,
1311
+ "learning_rate": 4.7544258015430463e-05,
1312
+ "loss": 0.18269542455673218,
1313
+ "step": 925
1314
+ },
1315
+ {
1316
+ "epoch": 0.16921397379912664,
1317
+ "grad_norm": 0.18369626998901367,
1318
+ "learning_rate": 4.75123457875928e-05,
1319
+ "loss": 0.1697891116142273,
1320
+ "step": 930
1321
+ },
1322
+ {
1323
+ "epoch": 0.17012372634643377,
1324
+ "grad_norm": 0.15266314148902893,
1325
+ "learning_rate": 4.7480238399515074e-05,
1326
+ "loss": 0.18523451089859008,
1327
+ "step": 935
1328
+ },
1329
+ {
1330
+ "epoch": 0.1710334788937409,
1331
+ "grad_norm": 0.16709664463996887,
1332
+ "learning_rate": 4.744793612953724e-05,
1333
+ "loss": 0.1803238034248352,
1334
+ "step": 940
1335
+ },
1336
+ {
1337
+ "epoch": 0.17194323144104803,
1338
+ "grad_norm": 0.14929179847240448,
1339
+ "learning_rate": 4.741543925768872e-05,
1340
+ "loss": 0.1861217737197876,
1341
+ "step": 945
1342
+ },
1343
+ {
1344
+ "epoch": 0.17285298398835516,
1345
+ "grad_norm": 0.1362280696630478,
1346
+ "learning_rate": 4.7382748065685915e-05,
1347
+ "loss": 0.17896100282669067,
1348
+ "step": 950
1349
+ },
1350
+ {
1351
+ "epoch": 0.1737627365356623,
1352
+ "grad_norm": 0.15290239453315735,
1353
+ "learning_rate": 4.734986283692982e-05,
1354
+ "loss": 0.18432788848876952,
1355
+ "step": 955
1356
+ },
1357
+ {
1358
+ "epoch": 0.17467248908296942,
1359
+ "grad_norm": 0.1287035197019577,
1360
+ "learning_rate": 4.73167838565035e-05,
1361
+ "loss": 0.18485682010650634,
1362
+ "step": 960
1363
+ },
1364
+ {
1365
+ "epoch": 0.17558224163027655,
1366
+ "grad_norm": 0.17969627678394318,
1367
+ "learning_rate": 4.728351141116971e-05,
1368
+ "loss": 0.17361557483673096,
1369
+ "step": 965
1370
+ },
1371
+ {
1372
+ "epoch": 0.1764919941775837,
1373
+ "grad_norm": 0.13751201331615448,
1374
+ "learning_rate": 4.7250045789368326e-05,
1375
+ "loss": 0.1731679320335388,
1376
+ "step": 970
1377
+ },
1378
+ {
1379
+ "epoch": 0.17740174672489084,
1380
+ "grad_norm": 0.1603265255689621,
1381
+ "learning_rate": 4.721638728121388e-05,
1382
+ "loss": 0.17308170795440675,
1383
+ "step": 975
1384
+ },
1385
+ {
1386
+ "epoch": 0.17831149927219797,
1387
+ "grad_norm": 0.1592789888381958,
1388
+ "learning_rate": 4.718253617849306e-05,
1389
+ "loss": 0.17534757852554322,
1390
+ "step": 980
1391
+ },
1392
+ {
1393
+ "epoch": 0.1792212518195051,
1394
+ "grad_norm": 0.12727224826812744,
1395
+ "learning_rate": 4.714849277466214e-05,
1396
+ "loss": 0.17817609310150145,
1397
+ "step": 985
1398
+ },
1399
+ {
1400
+ "epoch": 0.18013100436681223,
1401
+ "grad_norm": 0.15401554107666016,
1402
+ "learning_rate": 4.711425736484447e-05,
1403
+ "loss": 0.1733405351638794,
1404
+ "step": 990
1405
+ },
1406
+ {
1407
+ "epoch": 0.18104075691411936,
1408
+ "grad_norm": 0.13253968954086304,
1409
+ "learning_rate": 4.7079830245827906e-05,
1410
+ "loss": 0.17846795320510864,
1411
+ "step": 995
1412
+ },
1413
+ {
1414
+ "epoch": 0.1819505094614265,
1415
+ "grad_norm": 0.21846213936805725,
1416
+ "learning_rate": 4.7045211716062245e-05,
1417
+ "loss": 0.18021599054336548,
1418
+ "step": 1000
1419
+ }
1420
+ ],
1421
+ "logging_steps": 5,
1422
+ "max_steps": 5500,
1423
+ "num_input_tokens_seen": 0,
1424
+ "num_train_epochs": 2,
1425
+ "save_steps": 100,
1426
+ "stateful_callbacks": {
1427
+ "TrainerControl": {
1428
+ "args": {
1429
+ "should_epoch_stop": false,
1430
+ "should_evaluate": false,
1431
+ "should_log": false,
1432
+ "should_save": true,
1433
+ "should_training_stop": false
1434
+ },
1435
+ "attributes": {}
1436
+ }
1437
+ },
1438
+ "total_flos": 5.583006871819799e+17,
1439
+ "train_batch_size": 8,
1440
+ "trial_name": null,
1441
+ "trial_params": null
1442
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
3
+ size 5777
checkpoint-1100/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/gemma-4-E4B-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/gemma-4-E4B-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.19.1
checkpoint-1100/adapter_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma4ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma4.modeling_gemma4",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 16,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.0,
26
+ "lora_ga_config": null,
27
+ "megatron_config": null,
28
+ "megatron_core": "megatron.core",
29
+ "modules_to_save": null,
30
+ "peft_type": "LORA",
31
+ "peft_version": "0.19.1",
32
+ "qalora_group_size": 16,
33
+ "r": 16,
34
+ "rank_pattern": {},
35
+ "revision": null,
36
+ "target_modules": [
37
+ "gate_proj",
38
+ "v_proj",
39
+ "o_proj",
40
+ "k_proj",
41
+ "up_proj",
42
+ "down_proj",
43
+ "q_proj"
44
+ ],
45
+ "target_parameters": null,
46
+ "task_type": "CAUSAL_LM",
47
+ "trainable_token_indices": null,
48
+ "use_bdlora": null,
49
+ "use_dora": false,
50
+ "use_qalora": false,
51
+ "use_rslora": false
52
+ }
checkpoint-1100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4be3bea2ca3bd38e446c68a30717eb1a31d7d5b77955efe33bf656a8162068a
3
+ size 169741912
checkpoint-1100/chat_template.jinja ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required, filter_keys=false) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- set add_comma = false -%}
6
+ {%- if not filter_keys or key not in standard_keys -%}
7
+ {%- if ns.found_first %},{% endif -%}
8
+ {%- set ns.found_first = true -%}
9
+ {{ key }}:{
10
+ {%- if value['description'] -%}
11
+ description:<|"|>{{ value['description'] }}<|"|>
12
+ {%- set add_comma = true -%}
13
+ {%- endif -%}
14
+ {%- if value['type'] | upper == 'STRING' -%}
15
+ {%- if value['enum'] -%}
16
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
17
+ enum:{{ format_argument(value['enum']) }}
18
+ {%- endif -%}
19
+ {%- elif value['type'] | upper == 'ARRAY' -%}
20
+ {%- if value['items'] is mapping and value['items'] -%}
21
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
22
+ items:{
23
+ {%- set ns_items = namespace(found_first=false) -%}
24
+ {%- for item_key, item_value in value['items'] | dictsort -%}
25
+ {%- if item_value is not none -%}
26
+ {%- if ns_items.found_first %},{% endif -%}
27
+ {%- set ns_items.found_first = true -%}
28
+ {%- if item_key == 'properties' -%}
29
+ properties:{
30
+ {%- if item_value is mapping -%}
31
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
32
+ {%- endif -%}
33
+ }
34
+ {%- elif item_key == 'required' -%}
35
+ required:[
36
+ {%- for req_item in item_value -%}
37
+ <|"|>{{- req_item -}}<|"|>
38
+ {%- if not loop.last %},{% endif -%}
39
+ {%- endfor -%}
40
+ ]
41
+ {%- elif item_key == 'type' -%}
42
+ {%- if item_value is string -%}
43
+ type:{{ format_argument(item_value | upper) }}
44
+ {%- else -%}
45
+ type:{{ format_argument(item_value | map('upper') | list) }}
46
+ {%- endif -%}
47
+ {%- else -%}
48
+ {{ item_key }}:{{ format_argument(item_value) }}
49
+ {%- endif -%}
50
+ {%- endif -%}
51
+ {%- endfor -%}
52
+ }
53
+ {%- endif -%}
54
+ {%- endif -%}
55
+ {%- if value['nullable'] %}
56
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
57
+ nullable:true
58
+ {%- endif -%}
59
+ {%- if value['type'] | upper == 'OBJECT' -%}
60
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
61
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
62
+ properties:{
63
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
64
+ }
65
+ {%- elif value is mapping -%}
66
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
+ properties:{
68
+ {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
69
+ }
70
+ {%- endif -%}
71
+ {%- if value['required'] -%}
72
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
73
+ required:[
74
+ {%- for item in value['required'] | default([]) -%}
75
+ <|"|>{{- item -}}<|"|>
76
+ {%- if not loop.last %},{% endif -%}
77
+ {%- endfor -%}
78
+ ]
79
+ {%- endif -%}
80
+ {%- endif -%}
81
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
82
+ type:<|"|>{{ value['type'] | upper }}<|"|>}
83
+ {%- endif -%}
84
+ {%- endfor -%}
85
+ {%- endmacro -%}
86
+ {%- macro format_function_declaration(tool_data) -%}
87
+ declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
88
+ {%- set params = tool_data['function']['parameters'] -%}
89
+ {%- if params -%}
90
+ ,parameters:{
91
+ {%- if params['properties'] -%}
92
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
93
+ {%- endif -%}
94
+ {%- if params['required'] -%}
95
+ required:[
96
+ {%- for item in params['required'] -%}
97
+ <|"|>{{- item -}}<|"|>
98
+ {{- ',' if not loop.last -}}
99
+ {%- endfor -%}
100
+ ],
101
+ {%- endif -%}
102
+ {%- if params['type'] -%}
103
+ type:<|"|>{{- params['type'] | upper -}}<|"|>}
104
+ {%- endif -%}
105
+ {%- endif -%}
106
+ {%- if 'response' in tool_data['function'] -%}
107
+ {%- set response_declaration = tool_data['function']['response'] -%}
108
+ ,response:{
109
+ {%- if response_declaration['description'] -%}
110
+ description:<|"|>{{- response_declaration['description'] -}}<|"|>,
111
+ {%- endif -%}
112
+ {%- if response_declaration['type'] | upper == 'OBJECT' -%}
113
+ type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
114
+ {%- endif -%}
115
+ {%- endif -%}
116
+ }
117
+ {%- endmacro -%}
118
+ {%- macro format_argument(argument, escape_keys=True) -%}
119
+ {%- if argument is string -%}
120
+ {{- '<|"|>' + argument + '<|"|>' -}}
121
+ {%- elif argument is boolean -%}
122
+ {{- 'true' if argument else 'false' -}}
123
+ {%- elif argument is mapping -%}
124
+ {{- '{' -}}
125
+ {%- set ns = namespace(found_first=false) -%}
126
+ {%- for key, value in argument | dictsort -%}
127
+ {%- if ns.found_first %},{% endif -%}
128
+ {%- set ns.found_first = true -%}
129
+ {%- if escape_keys -%}
130
+ {{- '<|"|>' + key + '<|"|>' -}}
131
+ {%- else -%}
132
+ {{- key -}}
133
+ {%- endif -%}
134
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
135
+ {%- endfor -%}
136
+ {{- '}' -}}
137
+ {%- elif argument is sequence -%}
138
+ {{- '[' -}}
139
+ {%- for item in argument -%}
140
+ {{- format_argument(item, escape_keys=escape_keys) -}}
141
+ {%- if not loop.last %},{% endif -%}
142
+ {%- endfor -%}
143
+ {{- ']' -}}
144
+ {%- else -%}
145
+ {{- argument -}}
146
+ {%- endif -%}
147
+ {%- endmacro -%}
148
+ {%- macro strip_thinking(text) -%}
149
+ {%- set ns = namespace(result='') -%}
150
+ {%- for part in text.split('<channel|>') -%}
151
+ {%- if '<|channel>' in part -%}
152
+ {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
153
+ {%- else -%}
154
+ {%- set ns.result = ns.result + part -%}
155
+ {%- endif -%}
156
+ {%- endfor -%}
157
+ {{- ns.result | trim -}}
158
+ {%- endmacro -%}
159
+
160
+ {%- macro format_tool_response_block(tool_name, response) -%}
161
+ {{- '<|tool_response>' -}}
162
+ {%- if response is mapping -%}
163
+ {{- 'response:' + tool_name + '{' -}}
164
+ {%- for key, value in response | dictsort -%}
165
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
166
+ {%- if not loop.last %},{% endif -%}
167
+ {%- endfor -%}
168
+ {{- '}' -}}
169
+ {%- else -%}
170
+ {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
171
+ {%- endif -%}
172
+ {{- '<tool_response|>' -}}
173
+ {%- endmacro -%}
174
+
175
+ {%- set ns = namespace(prev_message_type=None) -%}
176
+ {%- set loop_messages = messages -%}
177
+ {{- bos_token -}}
178
+ {#- Handle System/Tool Definitions Block -#}
179
+ {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
+ {{- '<|turn>system\n' -}}
181
+ {#- Inject Thinking token at the very top of the FIRST system turn -#}
182
+ {%- if enable_thinking is defined and enable_thinking -%}
183
+ {{- '<|think|>\n' -}}
184
+ {%- set ns.prev_message_type = 'think' -%}
185
+ {%- endif -%}
186
+ {%- if messages[0]['role'] in ['system', 'developer'] -%}
187
+ {%- if messages[0]['content'] is string -%}
188
+ {{- messages[0]['content'] | trim -}}
189
+ {%- elif messages[0]['content'] is sequence -%}
190
+ {%- for item in messages[0]['content'] -%}
191
+ {{- item['text'] | trim + ' '-}}
192
+ {%- endfor -%}
193
+ {%- endif -%}
194
+ {%- set loop_messages = messages[1:] -%}
195
+ {%- endif -%}
196
+ {%- if tools -%}
197
+ {%- for tool in tools %}
198
+ {{- '<|tool>' -}}
199
+ {{- format_function_declaration(tool) | trim -}}
200
+ {{- '<tool|>' -}}
201
+ {%- endfor %}
202
+ {%- set ns.prev_message_type = 'tool' -%}
203
+ {%- endif -%}
204
+ {{- '<turn|>\n' -}}
205
+ {%- endif %}
206
+
207
+ {#- Pre-scan: find last user message index for reasoning guard -#}
208
+ {%- set ns_turn = namespace(last_user_idx=-1) -%}
209
+ {%- for i in range(loop_messages | length) -%}
210
+ {%- if loop_messages[i]['role'] == 'user' -%}
211
+ {%- set ns_turn.last_user_idx = i -%}
212
+ {%- endif -%}
213
+ {%- endfor -%}
214
+
215
+ {#- Loop through messages -#}
216
+ {%- for message in loop_messages -%}
217
+ {%- if message['role'] != 'tool' -%}
218
+ {%- set ns.prev_message_type = None -%}
219
+ {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
220
+ {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
221
+ {%- set prev_nt = namespace(role=None, found=false) -%}
222
+ {%- if loop.index0 > 0 -%}
223
+ {%- for j in range(loop.index0 - 1, -1, -1) -%}
224
+ {%- if not prev_nt.found -%}
225
+ {%- if loop_messages[j]['role'] != 'tool' -%}
226
+ {%- set prev_nt.role = loop_messages[j]['role'] -%}
227
+ {%- set prev_nt.found = true -%}
228
+ {%- endif -%}
229
+ {%- endif -%}
230
+ {%- endfor -%}
231
+ {%- endif -%}
232
+ {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
233
+ {%- if not continue_same_model_turn -%}
234
+ {{- '<|turn>' + role + '\n' }}
235
+ {%- endif -%}
236
+
237
+ {#- Render reasoning/reasoning_content as thinking channel -#}
238
+ {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
239
+ {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
240
+ {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
241
+ {%- endif -%}
242
+
243
+ {%- if message['tool_calls'] -%}
244
+ {%- for tool_call in message['tool_calls'] -%}
245
+ {%- set function = tool_call['function'] -%}
246
+ {{- '<|tool_call>call:' + function['name'] + '{' -}}
247
+ {%- if function['arguments'] is mapping -%}
248
+ {%- set ns_args = namespace(found_first=false) -%}
249
+ {%- for key, value in function['arguments'] | dictsort -%}
250
+ {%- if ns_args.found_first %},{% endif -%}
251
+ {%- set ns_args.found_first = true -%}
252
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
253
+ {%- endfor -%}
254
+ {%- elif function['arguments'] is string -%}
255
+ {{- function['arguments'] -}}
256
+ {%- endif -%}
257
+ {{- '}<tool_call|>' -}}
258
+ {%- endfor -%}
259
+ {%- set ns.prev_message_type = 'tool_call' -%}
260
+ {%- endif -%}
261
+
262
+ {%- set ns_tr_out = namespace(flag=false) -%}
263
+ {%- if message.get('tool_responses') -%}
264
+ {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
265
+ {%- for tool_response in message['tool_responses'] -%}
266
+ {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
267
+ {%- set ns_tr_out.flag = true -%}
268
+ {%- set ns.prev_message_type = 'tool_response' -%}
269
+ {%- endfor -%}
270
+ {%- elif message.get('tool_calls') -%}
271
+ {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
272
+ {%- set ns_tool_scan = namespace(stopped=false) -%}
273
+ {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
274
+ {%- if ns_tool_scan.stopped -%}
275
+ {%- elif loop_messages[k]['role'] != 'tool' -%}
276
+ {%- set ns_tool_scan.stopped = true -%}
277
+ {%- else -%}
278
+ {%- set follow = loop_messages[k] -%}
279
+ {#- Resolve tool_call_id to function name -#}
280
+ {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
281
+ {%- for tc in message['tool_calls'] -%}
282
+ {%- if tc.get('id') == follow.get('tool_call_id') -%}
283
+ {%- set ns_tname.name = tc['function']['name'] -%}
284
+ {%- endif -%}
285
+ {%- endfor -%}
286
+ {#- Handle content as string or content-parts array -#}
287
+ {%- set tool_body = follow.get('content') -%}
288
+ {%- if tool_body is string -%}
289
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
290
+ {%- elif tool_body is sequence and tool_body is not string -%}
291
+ {%- set ns_txt = namespace(s='') -%}
292
+ {%- for part in tool_body -%}
293
+ {%- if part.get('type') == 'text' -%}
294
+ {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
295
+ {%- endif -%}
296
+ {%- endfor -%}
297
+ {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
298
+ {%- else -%}
299
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
300
+ {%- endif -%}
301
+ {%- set ns_tr_out.flag = true -%}
302
+ {%- set ns.prev_message_type = 'tool_response' -%}
303
+ {%- endif -%}
304
+ {%- endfor -%}
305
+ {%- endif -%}
306
+
307
+ {%- set captured_content -%}
308
+ {%- if message['content'] is string -%}
309
+ {%- if role == 'model' -%}
310
+ {{- strip_thinking(message['content']) -}}
311
+ {%- else -%}
312
+ {{- message['content'] | trim -}}
313
+ {%- endif -%}
314
+ {%- elif message['content'] is sequence -%}
315
+ {%- for item in message['content'] -%}
316
+ {%- if item['type'] == 'text' -%}
317
+ {%- if role == 'model' -%}
318
+ {{- strip_thinking(item['text']) -}}
319
+ {%- else -%}
320
+ {{- item['text'] | trim -}}
321
+ {%- endif -%}
322
+ {%- elif item['type'] == 'image' -%}
323
+ {{- '<|image|>' -}}
324
+ {%- set ns.prev_message_type = 'image' -%}
325
+ {%- elif item['type'] == 'audio' -%}
326
+ {{- '<|audio|>' -}}
327
+ {%- set ns.prev_message_type = 'audio' -%}
328
+ {%- elif item['type'] == 'video' -%}
329
+ {{- '<|video|>' -}}
330
+ {%- set ns.prev_message_type = 'video' -%}
331
+ {%- endif -%}
332
+ {%- endfor -%}
333
+ {%- endif -%}
334
+ {%- endset -%}
335
+
336
+ {{- captured_content -}}
337
+ {%- set has_content = captured_content | trim | length > 0 -%}
338
+
339
+ {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
340
+ {{- '<|tool_response>' -}}
341
+ {%- elif not (ns_tr_out.flag and not has_content) -%}
342
+ {{- '<turn|>\n' -}}
343
+ {%- endif -%}
344
+ {%- endif -%}
345
+ {%- endfor -%}
346
+
347
+ {%- if add_generation_prompt -%}
348
+ {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
349
+ {{- '<|turn>model\n' -}}
350
+ {%- endif -%}
351
+ {%- endif -%}
checkpoint-1100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66120ce4d55186cce9be5cdf28e030e89994c81dac5711321d07d2b5ce8153e3
3
+ size 72807355
checkpoint-1100/processor_config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_ms_per_token": 40,
3
+ "audio_seq_length": 750,
4
+ "feature_extractor": {
5
+ "dither": 0.0,
6
+ "feature_extractor_type": "Gemma4AudioFeatureExtractor",
7
+ "feature_size": 128,
8
+ "fft_length": 512,
9
+ "fft_overdrive": false,
10
+ "frame_length": 320,
11
+ "hop_length": 160,
12
+ "input_scale_factor": 1.0,
13
+ "max_frequency": 8000.0,
14
+ "mel_floor": 0.001,
15
+ "min_frequency": 0.0,
16
+ "padding_side": "left",
17
+ "padding_value": 0.0,
18
+ "per_bin_mean": null,
19
+ "per_bin_stddev": null,
20
+ "preemphasis": 0.0,
21
+ "preemphasis_htk_flavor": true,
22
+ "return_attention_mask": true,
23
+ "sampling_rate": 16000
24
+ },
25
+ "image_processor": {
26
+ "do_convert_rgb": true,
27
+ "do_normalize": false,
28
+ "do_rescale": true,
29
+ "do_resize": true,
30
+ "image_mean": [
31
+ 0.0,
32
+ 0.0,
33
+ 0.0
34
+ ],
35
+ "image_processor_type": "Gemma4ImageProcessor",
36
+ "image_seq_length": 280,
37
+ "image_std": [
38
+ 1.0,
39
+ 1.0,
40
+ 1.0
41
+ ],
42
+ "max_soft_tokens": 280,
43
+ "patch_size": 16,
44
+ "pooling_kernel_size": 3,
45
+ "resample": 3,
46
+ "rescale_factor": 0.00392156862745098
47
+ },
48
+ "image_seq_length": 280,
49
+ "processor_class": "Gemma4Processor",
50
+ "video_processor": {
51
+ "do_convert_rgb": true,
52
+ "do_normalize": true,
53
+ "do_rescale": true,
54
+ "do_resize": true,
55
+ "do_sample_frames": true,
56
+ "image_mean": [
57
+ 0.0,
58
+ 0.0,
59
+ 0.0
60
+ ],
61
+ "image_std": [
62
+ 1.0,
63
+ 1.0,
64
+ 1.0
65
+ ],
66
+ "max_soft_tokens": 70,
67
+ "num_frames": 32,
68
+ "patch_size": 16,
69
+ "pooling_kernel_size": 3,
70
+ "resample": 3,
71
+ "rescale_factor": 0.00392156862745098,
72
+ "return_metadata": false,
73
+ "video_processor_type": "Gemma4VideoProcessor"
74
+ }
75
+ }
checkpoint-1100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
+ size 14645
checkpoint-1100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:936724e73ecd7ecf26460f7aeb2b5af5460899f93c78695a46fc00c541454d94
3
+ size 1465
checkpoint-1100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
3
+ size 32169626
checkpoint-1100/tokenizer_config.json ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<turn|>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 131072,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "right",
45
+ "processor_class": "Gemma4Processor",
46
+ "response_schema": {
47
+ "properties": {
48
+ "content": {
49
+ "type": "string"
50
+ },
51
+ "role": {
52
+ "const": "assistant"
53
+ },
54
+ "thinking": {
55
+ "type": "string"
56
+ },
57
+ "tool_calls": {
58
+ "items": {
59
+ "properties": {
60
+ "function": {
61
+ "properties": {
62
+ "arguments": {
63
+ "additionalProperties": {},
64
+ "type": "object",
65
+ "x-parser": "gemma4-tool-call"
66
+ },
67
+ "name": {
68
+ "type": "string"
69
+ }
70
+ },
71
+ "type": "object",
72
+ "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
73
+ },
74
+ "type": {
75
+ "const": "function"
76
+ }
77
+ },
78
+ "type": "object"
79
+ },
80
+ "type": "array",
81
+ "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
82
+ }
83
+ },
84
+ "type": "object",
85
+ "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?P<content>(?:(?!\\<turn\\|\\>)(?!\\<\\|tool_response\\>).)+)?(?:\\<turn\\|\\>|\\<\\|tool_response\\>)?"
86
+ },
87
+ "soc_token": "<|channel>",
88
+ "sot_token": "<|turn>",
89
+ "stc_token": "<|tool_call>",
90
+ "std_token": "<|tool>",
91
+ "str_token": "<|tool_response>",
92
+ "think_token": "<|think|>",
93
+ "tokenizer_class": "GemmaTokenizer",
94
+ "unk_token": "<unk>",
95
+ "added_tokens_decoder": {
96
+ "0": {
97
+ "content": "<pad>",
98
+ "single_word": false,
99
+ "lstrip": false,
100
+ "rstrip": false,
101
+ "normalized": false,
102
+ "special": true
103
+ },
104
+ "1": {
105
+ "content": "<eos>",
106
+ "single_word": false,
107
+ "lstrip": false,
108
+ "rstrip": false,
109
+ "normalized": false,
110
+ "special": true
111
+ },
112
+ "2": {
113
+ "content": "<bos>",
114
+ "single_word": false,
115
+ "lstrip": false,
116
+ "rstrip": false,
117
+ "normalized": false,
118
+ "special": true
119
+ },
120
+ "3": {
121
+ "content": "<unk>",
122
+ "single_word": false,
123
+ "lstrip": false,
124
+ "rstrip": false,
125
+ "normalized": false,
126
+ "special": true
127
+ },
128
+ "4": {
129
+ "content": "<mask>",
130
+ "single_word": false,
131
+ "lstrip": false,
132
+ "rstrip": false,
133
+ "normalized": false,
134
+ "special": true
135
+ },
136
+ "46": {
137
+ "content": "<|tool>",
138
+ "single_word": false,
139
+ "lstrip": false,
140
+ "rstrip": false,
141
+ "normalized": false,
142
+ "special": true
143
+ },
144
+ "47": {
145
+ "content": "<tool|>",
146
+ "single_word": false,
147
+ "lstrip": false,
148
+ "rstrip": false,
149
+ "normalized": false,
150
+ "special": true
151
+ },
152
+ "48": {
153
+ "content": "<|tool_call>",
154
+ "single_word": false,
155
+ "lstrip": false,
156
+ "rstrip": false,
157
+ "normalized": false,
158
+ "special": true
159
+ },
160
+ "49": {
161
+ "content": "<tool_call|>",
162
+ "single_word": false,
163
+ "lstrip": false,
164
+ "rstrip": false,
165
+ "normalized": false,
166
+ "special": true
167
+ },
168
+ "50": {
169
+ "content": "<|tool_response>",
170
+ "single_word": false,
171
+ "lstrip": false,
172
+ "rstrip": false,
173
+ "normalized": false,
174
+ "special": true
175
+ },
176
+ "51": {
177
+ "content": "<tool_response|>",
178
+ "single_word": false,
179
+ "lstrip": false,
180
+ "rstrip": false,
181
+ "normalized": false,
182
+ "special": true
183
+ },
184
+ "52": {
185
+ "content": "<|\"|>",
186
+ "single_word": false,
187
+ "lstrip": false,
188
+ "rstrip": false,
189
+ "normalized": false,
190
+ "special": true
191
+ },
192
+ "98": {
193
+ "content": "<|think|>",
194
+ "single_word": false,
195
+ "lstrip": false,
196
+ "rstrip": false,
197
+ "normalized": false,
198
+ "special": true
199
+ },
200
+ "100": {
201
+ "content": "<|channel>",
202
+ "single_word": false,
203
+ "lstrip": false,
204
+ "rstrip": false,
205
+ "normalized": false,
206
+ "special": true
207
+ },
208
+ "101": {
209
+ "content": "<channel|>",
210
+ "single_word": false,
211
+ "lstrip": false,
212
+ "rstrip": false,
213
+ "normalized": false,
214
+ "special": true
215
+ },
216
+ "105": {
217
+ "content": "<|turn>",
218
+ "single_word": false,
219
+ "lstrip": false,
220
+ "rstrip": false,
221
+ "normalized": false,
222
+ "special": true
223
+ },
224
+ "106": {
225
+ "content": "<turn|>",
226
+ "single_word": false,
227
+ "lstrip": false,
228
+ "rstrip": false,
229
+ "normalized": false,
230
+ "special": true
231
+ },
232
+ "255999": {
233
+ "content": "<|image>",
234
+ "single_word": false,
235
+ "lstrip": false,
236
+ "rstrip": false,
237
+ "normalized": false,
238
+ "special": true
239
+ },
240
+ "256000": {
241
+ "content": "<|audio>",
242
+ "single_word": false,
243
+ "lstrip": false,
244
+ "rstrip": false,
245
+ "normalized": false,
246
+ "special": true
247
+ },
248
+ "258880": {
249
+ "content": "<|image|>",
250
+ "single_word": false,
251
+ "lstrip": false,
252
+ "rstrip": false,
253
+ "normalized": false,
254
+ "special": true
255
+ },
256
+ "258881": {
257
+ "content": "<|audio|>",
258
+ "single_word": false,
259
+ "lstrip": false,
260
+ "rstrip": false,
261
+ "normalized": false,
262
+ "special": true
263
+ },
264
+ "258882": {
265
+ "content": "<image|>",
266
+ "single_word": false,
267
+ "lstrip": false,
268
+ "rstrip": false,
269
+ "normalized": false,
270
+ "special": true
271
+ },
272
+ "258883": {
273
+ "content": "<audio|>",
274
+ "single_word": false,
275
+ "lstrip": false,
276
+ "rstrip": false,
277
+ "normalized": false,
278
+ "special": true
279
+ },
280
+ "258884": {
281
+ "content": "<|video|>",
282
+ "single_word": false,
283
+ "lstrip": false,
284
+ "rstrip": false,
285
+ "normalized": false,
286
+ "special": true
287
+ }
288
+ }
289
+ }
checkpoint-1100/trainer_state.json ADDED
@@ -0,0 +1,1582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.20014556040756915,
6
+ "eval_steps": 100,
7
+ "global_step": 1100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0009097525473071324,
14
+ "grad_norm": 1.0602493286132812,
15
+ "learning_rate": 1.2121212121212122e-06,
16
+ "loss": 1.7156932830810547,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.001819505094614265,
21
+ "grad_norm": 1.1577719449996948,
22
+ "learning_rate": 2.7272727272727272e-06,
23
+ "loss": 1.6629371643066406,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.0027292576419213972,
28
+ "grad_norm": 1.0288419723510742,
29
+ "learning_rate": 4.242424242424243e-06,
30
+ "loss": 1.6706295013427734,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.00363901018922853,
35
+ "grad_norm": 2.129403829574585,
36
+ "learning_rate": 5.7575757575757586e-06,
37
+ "loss": 1.7363752365112304,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.004548762736535662,
42
+ "grad_norm": 1.9468326568603516,
43
+ "learning_rate": 7.272727272727272e-06,
44
+ "loss": 1.7111135482788087,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.0054585152838427945,
49
+ "grad_norm": 1.1269357204437256,
50
+ "learning_rate": 8.787878787878788e-06,
51
+ "loss": 1.6924203872680663,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.006368267831149927,
56
+ "grad_norm": 1.4021248817443848,
57
+ "learning_rate": 1.0303030303030304e-05,
58
+ "loss": 1.658310317993164,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.00727802037845706,
63
+ "grad_norm": 1.313381314277649,
64
+ "learning_rate": 1.1818181818181819e-05,
65
+ "loss": 1.5383296012878418,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.008187772925764192,
70
+ "grad_norm": 2.4359891414642334,
71
+ "learning_rate": 1.3333333333333333e-05,
72
+ "loss": 1.4302565574645996,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.009097525473071324,
77
+ "grad_norm": 1.6459542512893677,
78
+ "learning_rate": 1.484848484848485e-05,
79
+ "loss": 1.2602953910827637,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.010007278020378457,
84
+ "grad_norm": 0.7953159213066101,
85
+ "learning_rate": 1.6363636363636366e-05,
86
+ "loss": 1.204326343536377,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.010917030567685589,
91
+ "grad_norm": 0.5824465155601501,
92
+ "learning_rate": 1.787878787878788e-05,
93
+ "loss": 1.068561840057373,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.011826783114992722,
98
+ "grad_norm": 0.39265626668930054,
99
+ "learning_rate": 1.9393939393939395e-05,
100
+ "loss": 0.9570062637329102,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.012736535662299854,
105
+ "grad_norm": 0.3387283384799957,
106
+ "learning_rate": 2.090909090909091e-05,
107
+ "loss": 0.9454713821411133,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.013646288209606987,
112
+ "grad_norm": 0.3182811141014099,
113
+ "learning_rate": 2.2424242424242424e-05,
114
+ "loss": 0.8901592254638672,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.01455604075691412,
119
+ "grad_norm": 0.2735312879085541,
120
+ "learning_rate": 2.393939393939394e-05,
121
+ "loss": 0.8491583824157715,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.015465793304221253,
126
+ "grad_norm": 0.2376435250043869,
127
+ "learning_rate": 2.5454545454545454e-05,
128
+ "loss": 0.8109179496765136,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.016375545851528384,
133
+ "grad_norm": 0.2161586880683899,
134
+ "learning_rate": 2.696969696969697e-05,
135
+ "loss": 0.76962308883667,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.017285298398835518,
140
+ "grad_norm": 0.19587980210781097,
141
+ "learning_rate": 2.8484848484848486e-05,
142
+ "loss": 0.7301986694335938,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.018195050946142648,
147
+ "grad_norm": 0.20971694588661194,
148
+ "learning_rate": 3e-05,
149
+ "loss": 0.7269618034362793,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.018195050946142648,
154
+ "eval_loss": 2.605874538421631,
155
+ "eval_runtime": 1120.0905,
156
+ "eval_samples_per_second": 33.935,
157
+ "eval_steps_per_second": 8.484,
158
+ "step": 100
159
+ },
160
+ {
161
+ "epoch": 0.01910480349344978,
162
+ "grad_norm": 0.10413152724504471,
163
+ "learning_rate": 3.151515151515151e-05,
164
+ "loss": 0.3250573635101318,
165
+ "step": 105
166
+ },
167
+ {
168
+ "epoch": 0.020014556040756915,
169
+ "grad_norm": 0.09383206814527512,
170
+ "learning_rate": 3.303030303030303e-05,
171
+ "loss": 0.3277724742889404,
172
+ "step": 110
173
+ },
174
+ {
175
+ "epoch": 0.020924308588064048,
176
+ "grad_norm": 0.1195850670337677,
177
+ "learning_rate": 3.454545454545455e-05,
178
+ "loss": 0.3215961217880249,
179
+ "step": 115
180
+ },
181
+ {
182
+ "epoch": 0.021834061135371178,
183
+ "grad_norm": 0.0715397521853447,
184
+ "learning_rate": 3.606060606060606e-05,
185
+ "loss": 0.3120795965194702,
186
+ "step": 120
187
+ },
188
+ {
189
+ "epoch": 0.02274381368267831,
190
+ "grad_norm": 0.068007692694664,
191
+ "learning_rate": 3.757575757575758e-05,
192
+ "loss": 0.2964257955551147,
193
+ "step": 125
194
+ },
195
+ {
196
+ "epoch": 0.023653566229985445,
197
+ "grad_norm": 0.09345484524965286,
198
+ "learning_rate": 3.909090909090909e-05,
199
+ "loss": 0.30776252746582033,
200
+ "step": 130
201
+ },
202
+ {
203
+ "epoch": 0.024563318777292575,
204
+ "grad_norm": 0.05577846243977547,
205
+ "learning_rate": 4.0606060606060606e-05,
206
+ "loss": 0.3180255889892578,
207
+ "step": 135
208
+ },
209
+ {
210
+ "epoch": 0.025473071324599708,
211
+ "grad_norm": 0.05919989198446274,
212
+ "learning_rate": 4.212121212121212e-05,
213
+ "loss": 0.31608285903930666,
214
+ "step": 140
215
+ },
216
+ {
217
+ "epoch": 0.02638282387190684,
218
+ "grad_norm": 0.05644674599170685,
219
+ "learning_rate": 4.3636363636363636e-05,
220
+ "loss": 0.2993780136108398,
221
+ "step": 145
222
+ },
223
+ {
224
+ "epoch": 0.027292576419213975,
225
+ "grad_norm": 0.059986088424921036,
226
+ "learning_rate": 4.515151515151516e-05,
227
+ "loss": 0.2931638479232788,
228
+ "step": 150
229
+ },
230
+ {
231
+ "epoch": 0.028202328966521105,
232
+ "grad_norm": 0.05941484495997429,
233
+ "learning_rate": 4.666666666666667e-05,
234
+ "loss": 0.29284651279449464,
235
+ "step": 155
236
+ },
237
+ {
238
+ "epoch": 0.02911208151382824,
239
+ "grad_norm": 0.0579044483602047,
240
+ "learning_rate": 4.8181818181818186e-05,
241
+ "loss": 0.2927037000656128,
242
+ "step": 160
243
+ },
244
+ {
245
+ "epoch": 0.030021834061135372,
246
+ "grad_norm": 0.061985693871974945,
247
+ "learning_rate": 4.9696969696969694e-05,
248
+ "loss": 0.28671720027923586,
249
+ "step": 165
250
+ },
251
+ {
252
+ "epoch": 0.030931586608442505,
253
+ "grad_norm": 0.05715535953640938,
254
+ "learning_rate": 4.999993064772809e-05,
255
+ "loss": 0.2817929744720459,
256
+ "step": 170
257
+ },
258
+ {
259
+ "epoch": 0.03184133915574964,
260
+ "grad_norm": 0.06549780815839767,
261
+ "learning_rate": 4.999964890478288e-05,
262
+ "loss": 0.27853829860687257,
263
+ "step": 175
264
+ },
265
+ {
266
+ "epoch": 0.03275109170305677,
267
+ "grad_norm": 0.05948757752776146,
268
+ "learning_rate": 4.999915043908795e-05,
269
+ "loss": 0.27522289752960205,
270
+ "step": 180
271
+ },
272
+ {
273
+ "epoch": 0.0336608442503639,
274
+ "grad_norm": 0.06262889504432678,
275
+ "learning_rate": 4.9998435254964515e-05,
276
+ "loss": 0.270997428894043,
277
+ "step": 185
278
+ },
279
+ {
280
+ "epoch": 0.034570596797671035,
281
+ "grad_norm": 0.06916829943656921,
282
+ "learning_rate": 4.999750335861253e-05,
283
+ "loss": 0.2788438558578491,
284
+ "step": 190
285
+ },
286
+ {
287
+ "epoch": 0.035480349344978165,
288
+ "grad_norm": 0.06128217652440071,
289
+ "learning_rate": 4.9996354758110624e-05,
290
+ "loss": 0.25649352073669435,
291
+ "step": 195
292
+ },
293
+ {
294
+ "epoch": 0.036390101892285295,
295
+ "grad_norm": 0.06704027950763702,
296
+ "learning_rate": 4.999498946341606e-05,
297
+ "loss": 0.25619523525238036,
298
+ "step": 200
299
+ },
300
+ {
301
+ "epoch": 0.03729985443959243,
302
+ "grad_norm": 0.061678580939769745,
303
+ "learning_rate": 4.999340748636462e-05,
304
+ "loss": 0.24956226348876953,
305
+ "step": 205
306
+ },
307
+ {
308
+ "epoch": 0.03820960698689956,
309
+ "grad_norm": 0.07328873127698898,
310
+ "learning_rate": 4.999160884067051e-05,
311
+ "loss": 0.26169676780700685,
312
+ "step": 210
313
+ },
314
+ {
315
+ "epoch": 0.0391193595342067,
316
+ "grad_norm": 0.08287990838289261,
317
+ "learning_rate": 4.9989593541926246e-05,
318
+ "loss": 0.2574604034423828,
319
+ "step": 215
320
+ },
321
+ {
322
+ "epoch": 0.04002911208151383,
323
+ "grad_norm": 0.06787359714508057,
324
+ "learning_rate": 4.9987361607602525e-05,
325
+ "loss": 0.25351409912109374,
326
+ "step": 220
327
+ },
328
+ {
329
+ "epoch": 0.04093886462882096,
330
+ "grad_norm": 0.06695502996444702,
331
+ "learning_rate": 4.998491305704805e-05,
332
+ "loss": 0.24522039890289307,
333
+ "step": 225
334
+ },
335
+ {
336
+ "epoch": 0.041848617176128096,
337
+ "grad_norm": 0.08872214704751968,
338
+ "learning_rate": 4.9982247911489375e-05,
339
+ "loss": 0.2581867933273315,
340
+ "step": 230
341
+ },
342
+ {
343
+ "epoch": 0.042758369723435226,
344
+ "grad_norm": 0.07637131959199905,
345
+ "learning_rate": 4.9979366194030743e-05,
346
+ "loss": 0.25569658279418944,
347
+ "step": 235
348
+ },
349
+ {
350
+ "epoch": 0.043668122270742356,
351
+ "grad_norm": 0.08158119022846222,
352
+ "learning_rate": 4.997626792965385e-05,
353
+ "loss": 0.2529409646987915,
354
+ "step": 240
355
+ },
356
+ {
357
+ "epoch": 0.04457787481804949,
358
+ "grad_norm": 0.07529161125421524,
359
+ "learning_rate": 4.997295314521766e-05,
360
+ "loss": 0.24049024581909179,
361
+ "step": 245
362
+ },
363
+ {
364
+ "epoch": 0.04548762736535662,
365
+ "grad_norm": 0.08860139548778534,
366
+ "learning_rate": 4.996942186945813e-05,
367
+ "loss": 0.2490522861480713,
368
+ "step": 250
369
+ },
370
+ {
371
+ "epoch": 0.04639737991266375,
372
+ "grad_norm": 0.0850321501493454,
373
+ "learning_rate": 4.9965674132988005e-05,
374
+ "loss": 0.24180831909179687,
375
+ "step": 255
376
+ },
377
+ {
378
+ "epoch": 0.04730713245997089,
379
+ "grad_norm": 0.07556115090847015,
380
+ "learning_rate": 4.996170996829653e-05,
381
+ "loss": 0.2509631872177124,
382
+ "step": 260
383
+ },
384
+ {
385
+ "epoch": 0.04821688500727802,
386
+ "grad_norm": 0.07971206307411194,
387
+ "learning_rate": 4.995752940974918e-05,
388
+ "loss": 0.24398891925811766,
389
+ "step": 265
390
+ },
391
+ {
392
+ "epoch": 0.04912663755458515,
393
+ "grad_norm": 0.09149336814880371,
394
+ "learning_rate": 4.9953132493587344e-05,
395
+ "loss": 0.2300492286682129,
396
+ "step": 270
397
+ },
398
+ {
399
+ "epoch": 0.050036390101892286,
400
+ "grad_norm": 0.08265820890665054,
401
+ "learning_rate": 4.9948519257928034e-05,
402
+ "loss": 0.24246792793273925,
403
+ "step": 275
404
+ },
405
+ {
406
+ "epoch": 0.050946142649199416,
407
+ "grad_norm": 0.10328587144613266,
408
+ "learning_rate": 4.9943689742763534e-05,
409
+ "loss": 0.2367171049118042,
410
+ "step": 280
411
+ },
412
+ {
413
+ "epoch": 0.05185589519650655,
414
+ "grad_norm": 0.0836917981505394,
415
+ "learning_rate": 4.993864398996105e-05,
416
+ "loss": 0.23215813636779786,
417
+ "step": 285
418
+ },
419
+ {
420
+ "epoch": 0.05276564774381368,
421
+ "grad_norm": 0.09475161135196686,
422
+ "learning_rate": 4.99333820432624e-05,
423
+ "loss": 0.2350748062133789,
424
+ "step": 290
425
+ },
426
+ {
427
+ "epoch": 0.05367540029112081,
428
+ "grad_norm": 0.08040128648281097,
429
+ "learning_rate": 4.992790394828355e-05,
430
+ "loss": 0.23253886699676513,
431
+ "step": 295
432
+ },
433
+ {
434
+ "epoch": 0.05458515283842795,
435
+ "grad_norm": 0.08852150291204453,
436
+ "learning_rate": 4.992220975251428e-05,
437
+ "loss": 0.23856515884399415,
438
+ "step": 300
439
+ },
440
+ {
441
+ "epoch": 0.05549490538573508,
442
+ "grad_norm": 0.09565229713916779,
443
+ "learning_rate": 4.991629950531775e-05,
444
+ "loss": 0.23311660289764405,
445
+ "step": 305
446
+ },
447
+ {
448
+ "epoch": 0.05640465793304221,
449
+ "grad_norm": 0.08158160001039505,
450
+ "learning_rate": 4.991017325793009e-05,
451
+ "loss": 0.22467944622039795,
452
+ "step": 310
453
+ },
454
+ {
455
+ "epoch": 0.05731441048034935,
456
+ "grad_norm": 0.07746429741382599,
457
+ "learning_rate": 4.990383106345994e-05,
458
+ "loss": 0.229844069480896,
459
+ "step": 315
460
+ },
461
+ {
462
+ "epoch": 0.05822416302765648,
463
+ "grad_norm": 0.08564355969429016,
464
+ "learning_rate": 4.989727297688797e-05,
465
+ "loss": 0.22414517402648926,
466
+ "step": 320
467
+ },
468
+ {
469
+ "epoch": 0.05913391557496361,
470
+ "grad_norm": 0.07517435401678085,
471
+ "learning_rate": 4.9890499055066435e-05,
472
+ "loss": 0.2236532211303711,
473
+ "step": 325
474
+ },
475
+ {
476
+ "epoch": 0.060043668122270744,
477
+ "grad_norm": 0.111734539270401,
478
+ "learning_rate": 4.988350935671869e-05,
479
+ "loss": 0.21474847793579102,
480
+ "step": 330
481
+ },
482
+ {
483
+ "epoch": 0.060953420669577874,
484
+ "grad_norm": 0.09906989336013794,
485
+ "learning_rate": 4.987630394243866e-05,
486
+ "loss": 0.23321933746337892,
487
+ "step": 335
488
+ },
489
+ {
490
+ "epoch": 0.06186317321688501,
491
+ "grad_norm": 0.10131457448005676,
492
+ "learning_rate": 4.98688828746903e-05,
493
+ "loss": 0.2310662031173706,
494
+ "step": 340
495
+ },
496
+ {
497
+ "epoch": 0.06277292576419213,
498
+ "grad_norm": 0.09203507006168365,
499
+ "learning_rate": 4.986124621780708e-05,
500
+ "loss": 0.22021169662475587,
501
+ "step": 345
502
+ },
503
+ {
504
+ "epoch": 0.06368267831149928,
505
+ "grad_norm": 0.09505912661552429,
506
+ "learning_rate": 4.9853394037991416e-05,
507
+ "loss": 0.2197155237197876,
508
+ "step": 350
509
+ },
510
+ {
511
+ "epoch": 0.06459243085880641,
512
+ "grad_norm": 0.09038657695055008,
513
+ "learning_rate": 4.984532640331412e-05,
514
+ "loss": 0.22066287994384765,
515
+ "step": 355
516
+ },
517
+ {
518
+ "epoch": 0.06550218340611354,
519
+ "grad_norm": 0.09707064181566238,
520
+ "learning_rate": 4.9837043383713753e-05,
521
+ "loss": 0.22455451488494874,
522
+ "step": 360
523
+ },
524
+ {
525
+ "epoch": 0.06641193595342067,
526
+ "grad_norm": 0.10367228090763092,
527
+ "learning_rate": 4.98285450509961e-05,
528
+ "loss": 0.21993820667266845,
529
+ "step": 365
530
+ },
531
+ {
532
+ "epoch": 0.0673216885007278,
533
+ "grad_norm": 0.12229471653699875,
534
+ "learning_rate": 4.9819831478833456e-05,
535
+ "loss": 0.2168867588043213,
536
+ "step": 370
537
+ },
538
+ {
539
+ "epoch": 0.06823144104803494,
540
+ "grad_norm": 0.0964592918753624,
541
+ "learning_rate": 4.981090274276406e-05,
542
+ "loss": 0.21579203605651856,
543
+ "step": 375
544
+ },
545
+ {
546
+ "epoch": 0.06914119359534207,
547
+ "grad_norm": 0.09400496631860733,
548
+ "learning_rate": 4.980175892019141e-05,
549
+ "loss": 0.20972180366516113,
550
+ "step": 380
551
+ },
552
+ {
553
+ "epoch": 0.0700509461426492,
554
+ "grad_norm": 0.08158645778894424,
555
+ "learning_rate": 4.9792400090383594e-05,
556
+ "loss": 0.22148358821868896,
557
+ "step": 385
558
+ },
559
+ {
560
+ "epoch": 0.07096069868995633,
561
+ "grad_norm": 0.10916394740343094,
562
+ "learning_rate": 4.978282633447261e-05,
563
+ "loss": 0.2214418649673462,
564
+ "step": 390
565
+ },
566
+ {
567
+ "epoch": 0.07187045123726346,
568
+ "grad_norm": 0.11138810962438583,
569
+ "learning_rate": 4.9773037735453636e-05,
570
+ "loss": 0.21814754009246826,
571
+ "step": 395
572
+ },
573
+ {
574
+ "epoch": 0.07278020378457059,
575
+ "grad_norm": 0.10914396494626999,
576
+ "learning_rate": 4.9763034378184365e-05,
577
+ "loss": 0.21310818195343018,
578
+ "step": 400
579
+ },
580
+ {
581
+ "epoch": 0.07368995633187773,
582
+ "grad_norm": 0.1043366864323616,
583
+ "learning_rate": 4.975281634938421e-05,
584
+ "loss": 0.21266789436340333,
585
+ "step": 405
586
+ },
587
+ {
588
+ "epoch": 0.07459970887918486,
589
+ "grad_norm": 0.1036868542432785,
590
+ "learning_rate": 4.9742383737633594e-05,
591
+ "loss": 0.21606721878051757,
592
+ "step": 410
593
+ },
594
+ {
595
+ "epoch": 0.075509461426492,
596
+ "grad_norm": 0.11640442907810211,
597
+ "learning_rate": 4.9731736633373144e-05,
598
+ "loss": 0.21532948017120362,
599
+ "step": 415
600
+ },
601
+ {
602
+ "epoch": 0.07641921397379912,
603
+ "grad_norm": 0.11219926178455353,
604
+ "learning_rate": 4.9720875128902956e-05,
605
+ "loss": 0.2191627025604248,
606
+ "step": 420
607
+ },
608
+ {
609
+ "epoch": 0.07732896652110625,
610
+ "grad_norm": 0.12103637307882309,
611
+ "learning_rate": 4.970979931838176e-05,
612
+ "loss": 0.20938868522644044,
613
+ "step": 425
614
+ },
615
+ {
616
+ "epoch": 0.0782387190684134,
617
+ "grad_norm": 0.13274189829826355,
618
+ "learning_rate": 4.96985092978261e-05,
619
+ "loss": 0.21792960166931152,
620
+ "step": 430
621
+ },
622
+ {
623
+ "epoch": 0.07914847161572053,
624
+ "grad_norm": 0.11164513230323792,
625
+ "learning_rate": 4.968700516510954e-05,
626
+ "loss": 0.2022618055343628,
627
+ "step": 435
628
+ },
629
+ {
630
+ "epoch": 0.08005822416302766,
631
+ "grad_norm": 0.09532847255468369,
632
+ "learning_rate": 4.967528701996174e-05,
633
+ "loss": 0.21255812644958497,
634
+ "step": 440
635
+ },
636
+ {
637
+ "epoch": 0.08096797671033479,
638
+ "grad_norm": 0.10279258340597153,
639
+ "learning_rate": 4.96633549639677e-05,
640
+ "loss": 0.20683050155639648,
641
+ "step": 445
642
+ },
643
+ {
644
+ "epoch": 0.08187772925764192,
645
+ "grad_norm": 0.1257462352514267,
646
+ "learning_rate": 4.965120910056677e-05,
647
+ "loss": 0.21419920921325683,
648
+ "step": 450
649
+ },
650
+ {
651
+ "epoch": 0.08278748180494905,
652
+ "grad_norm": 0.11663137376308441,
653
+ "learning_rate": 4.963884953505186e-05,
654
+ "loss": 0.2072287082672119,
655
+ "step": 455
656
+ },
657
+ {
658
+ "epoch": 0.08369723435225619,
659
+ "grad_norm": 0.10488224029541016,
660
+ "learning_rate": 4.96262763745684e-05,
661
+ "loss": 0.1982678532600403,
662
+ "step": 460
663
+ },
664
+ {
665
+ "epoch": 0.08460698689956332,
666
+ "grad_norm": 0.11801692098379135,
667
+ "learning_rate": 4.961348972811354e-05,
668
+ "loss": 0.20662031173706055,
669
+ "step": 465
670
+ },
671
+ {
672
+ "epoch": 0.08551673944687045,
673
+ "grad_norm": 0.11318827420473099,
674
+ "learning_rate": 4.96004897065351e-05,
675
+ "loss": 0.20947303771972656,
676
+ "step": 470
677
+ },
678
+ {
679
+ "epoch": 0.08642649199417758,
680
+ "grad_norm": 0.13409486413002014,
681
+ "learning_rate": 4.95872764225307e-05,
682
+ "loss": 0.19670876264572143,
683
+ "step": 475
684
+ },
685
+ {
686
+ "epoch": 0.08733624454148471,
687
+ "grad_norm": 0.14440792798995972,
688
+ "learning_rate": 4.957384999064672e-05,
689
+ "loss": 0.19842848777770997,
690
+ "step": 480
691
+ },
692
+ {
693
+ "epoch": 0.08824599708879186,
694
+ "grad_norm": 0.12246996909379959,
695
+ "learning_rate": 4.956021052727731e-05,
696
+ "loss": 0.20318071842193602,
697
+ "step": 485
698
+ },
699
+ {
700
+ "epoch": 0.08915574963609899,
701
+ "grad_norm": 0.13437233865261078,
702
+ "learning_rate": 4.954635815066342e-05,
703
+ "loss": 0.21675212383270265,
704
+ "step": 490
705
+ },
706
+ {
707
+ "epoch": 0.09006550218340612,
708
+ "grad_norm": 0.11109672486782074,
709
+ "learning_rate": 4.9532292980891744e-05,
710
+ "loss": 0.2100757837295532,
711
+ "step": 495
712
+ },
713
+ {
714
+ "epoch": 0.09097525473071325,
715
+ "grad_norm": 0.1388893872499466,
716
+ "learning_rate": 4.9518015139893675e-05,
717
+ "loss": 0.20303285121917725,
718
+ "step": 500
719
+ },
720
+ {
721
+ "epoch": 0.09188500727802038,
722
+ "grad_norm": 0.13239721953868866,
723
+ "learning_rate": 4.950352475144427e-05,
724
+ "loss": 0.2152268409729004,
725
+ "step": 505
726
+ },
727
+ {
728
+ "epoch": 0.0927947598253275,
729
+ "grad_norm": 0.12834979593753815,
730
+ "learning_rate": 4.948882194116119e-05,
731
+ "loss": 0.20799248218536376,
732
+ "step": 510
733
+ },
734
+ {
735
+ "epoch": 0.09370451237263465,
736
+ "grad_norm": 0.11886704713106155,
737
+ "learning_rate": 4.947390683650354e-05,
738
+ "loss": 0.20394976139068605,
739
+ "step": 515
740
+ },
741
+ {
742
+ "epoch": 0.09461426491994178,
743
+ "grad_norm": 0.11398876458406448,
744
+ "learning_rate": 4.945877956677083e-05,
745
+ "loss": 0.2091092586517334,
746
+ "step": 520
747
+ },
748
+ {
749
+ "epoch": 0.09552401746724891,
750
+ "grad_norm": 0.1422540694475174,
751
+ "learning_rate": 4.944344026310186e-05,
752
+ "loss": 0.19564238786697388,
753
+ "step": 525
754
+ },
755
+ {
756
+ "epoch": 0.09643377001455604,
757
+ "grad_norm": 0.11359584331512451,
758
+ "learning_rate": 4.9427889058473535e-05,
759
+ "loss": 0.20493624210357667,
760
+ "step": 530
761
+ },
762
+ {
763
+ "epoch": 0.09734352256186317,
764
+ "grad_norm": 0.11703553050756454,
765
+ "learning_rate": 4.941212608769974e-05,
766
+ "loss": 0.2098615884780884,
767
+ "step": 535
768
+ },
769
+ {
770
+ "epoch": 0.0982532751091703,
771
+ "grad_norm": 0.14552047848701477,
772
+ "learning_rate": 4.939615148743017e-05,
773
+ "loss": 0.20382182598114013,
774
+ "step": 540
775
+ },
776
+ {
777
+ "epoch": 0.09916302765647744,
778
+ "grad_norm": 0.13178016245365143,
779
+ "learning_rate": 4.937996539614914e-05,
780
+ "loss": 0.19901862144470214,
781
+ "step": 545
782
+ },
783
+ {
784
+ "epoch": 0.10007278020378457,
785
+ "grad_norm": 0.635392427444458,
786
+ "learning_rate": 4.936356795417439e-05,
787
+ "loss": 0.20694944858551026,
788
+ "step": 550
789
+ },
790
+ {
791
+ "epoch": 0.1009825327510917,
792
+ "grad_norm": 0.15019077062606812,
793
+ "learning_rate": 4.934695930365586e-05,
794
+ "loss": 0.19313746690750122,
795
+ "step": 555
796
+ },
797
+ {
798
+ "epoch": 0.10189228529839883,
799
+ "grad_norm": 0.12941956520080566,
800
+ "learning_rate": 4.9330139588574474e-05,
801
+ "loss": 0.19671722650527954,
802
+ "step": 560
803
+ },
804
+ {
805
+ "epoch": 0.10280203784570596,
806
+ "grad_norm": 0.13818831741809845,
807
+ "learning_rate": 4.931310895474088e-05,
808
+ "loss": 0.20026786327362062,
809
+ "step": 565
810
+ },
811
+ {
812
+ "epoch": 0.1037117903930131,
813
+ "grad_norm": 0.12011194974184036,
814
+ "learning_rate": 4.929586754979417e-05,
815
+ "loss": 0.1932437539100647,
816
+ "step": 570
817
+ },
818
+ {
819
+ "epoch": 0.10462154294032024,
820
+ "grad_norm": 0.1345364898443222,
821
+ "learning_rate": 4.9278415523200644e-05,
822
+ "loss": 0.20245940685272218,
823
+ "step": 575
824
+ },
825
+ {
826
+ "epoch": 0.10553129548762737,
827
+ "grad_norm": 0.13281017541885376,
828
+ "learning_rate": 4.926075302625247e-05,
829
+ "loss": 0.19864981174468993,
830
+ "step": 580
831
+ },
832
+ {
833
+ "epoch": 0.1064410480349345,
834
+ "grad_norm": 0.13465586304664612,
835
+ "learning_rate": 4.924288021206639e-05,
836
+ "loss": 0.19573183059692384,
837
+ "step": 585
838
+ },
839
+ {
840
+ "epoch": 0.10735080058224163,
841
+ "grad_norm": 0.15225961804389954,
842
+ "learning_rate": 4.9224797235582396e-05,
843
+ "loss": 0.19946801662445068,
844
+ "step": 590
845
+ },
846
+ {
847
+ "epoch": 0.10826055312954876,
848
+ "grad_norm": 0.12816746532917023,
849
+ "learning_rate": 4.92065042535624e-05,
850
+ "loss": 0.19851526021957397,
851
+ "step": 595
852
+ },
853
+ {
854
+ "epoch": 0.1091703056768559,
855
+ "grad_norm": 0.13802853226661682,
856
+ "learning_rate": 4.9188001424588824e-05,
857
+ "loss": 0.19321763515472412,
858
+ "step": 600
859
+ },
860
+ {
861
+ "epoch": 0.11008005822416303,
862
+ "grad_norm": 0.17504797875881195,
863
+ "learning_rate": 4.9169288909063295e-05,
864
+ "loss": 0.2032616138458252,
865
+ "step": 605
866
+ },
867
+ {
868
+ "epoch": 0.11098981077147016,
869
+ "grad_norm": 0.13544194400310516,
870
+ "learning_rate": 4.91503668692052e-05,
871
+ "loss": 0.2011256456375122,
872
+ "step": 610
873
+ },
874
+ {
875
+ "epoch": 0.11189956331877729,
876
+ "grad_norm": 1.3976134061813354,
877
+ "learning_rate": 4.91312354690503e-05,
878
+ "loss": 0.19916868209838867,
879
+ "step": 615
880
+ },
881
+ {
882
+ "epoch": 0.11280931586608442,
883
+ "grad_norm": 0.1465059071779251,
884
+ "learning_rate": 4.91118948744493e-05,
885
+ "loss": 0.19487457275390624,
886
+ "step": 620
887
+ },
888
+ {
889
+ "epoch": 0.11371906841339156,
890
+ "grad_norm": 0.12103168666362762,
891
+ "learning_rate": 4.909234525306645e-05,
892
+ "loss": 0.1907251238822937,
893
+ "step": 625
894
+ },
895
+ {
896
+ "epoch": 0.1146288209606987,
897
+ "grad_norm": 0.12660574913024902,
898
+ "learning_rate": 4.907258677437802e-05,
899
+ "loss": 0.19327253103256226,
900
+ "step": 630
901
+ },
902
+ {
903
+ "epoch": 0.11553857350800582,
904
+ "grad_norm": 0.1347813606262207,
905
+ "learning_rate": 4.90526196096709e-05,
906
+ "loss": 0.19637736082077026,
907
+ "step": 635
908
+ },
909
+ {
910
+ "epoch": 0.11644832605531295,
911
+ "grad_norm": 0.14953652024269104,
912
+ "learning_rate": 4.903244393204107e-05,
913
+ "loss": 0.20325069427490233,
914
+ "step": 640
915
+ },
916
+ {
917
+ "epoch": 0.11735807860262008,
918
+ "grad_norm": 0.13936272263526917,
919
+ "learning_rate": 4.901205991639213e-05,
920
+ "loss": 0.1930275321006775,
921
+ "step": 645
922
+ },
923
+ {
924
+ "epoch": 0.11826783114992721,
925
+ "grad_norm": 0.1448420137166977,
926
+ "learning_rate": 4.899146773943374e-05,
927
+ "loss": 0.20026936531066894,
928
+ "step": 650
929
+ },
930
+ {
931
+ "epoch": 0.11917758369723436,
932
+ "grad_norm": 0.1312534064054489,
933
+ "learning_rate": 4.897066757968014e-05,
934
+ "loss": 0.19062033891677857,
935
+ "step": 655
936
+ },
937
+ {
938
+ "epoch": 0.12008733624454149,
939
+ "grad_norm": 0.13644742965698242,
940
+ "learning_rate": 4.894965961744859e-05,
941
+ "loss": 0.18719595670700073,
942
+ "step": 660
943
+ },
944
+ {
945
+ "epoch": 0.12099708879184862,
946
+ "grad_norm": 0.14276087284088135,
947
+ "learning_rate": 4.892844403485777e-05,
948
+ "loss": 0.19784307479858398,
949
+ "step": 665
950
+ },
951
+ {
952
+ "epoch": 0.12190684133915575,
953
+ "grad_norm": 0.14735399186611176,
954
+ "learning_rate": 4.890702101582623e-05,
955
+ "loss": 0.19163782596588136,
956
+ "step": 670
957
+ },
958
+ {
959
+ "epoch": 0.12281659388646288,
960
+ "grad_norm": 0.15742065012454987,
961
+ "learning_rate": 4.888539074607082e-05,
962
+ "loss": 0.19312986135482788,
963
+ "step": 675
964
+ },
965
+ {
966
+ "epoch": 0.12372634643377002,
967
+ "grad_norm": 0.12917031347751617,
968
+ "learning_rate": 4.8863553413105025e-05,
969
+ "loss": 0.20066320896148682,
970
+ "step": 680
971
+ },
972
+ {
973
+ "epoch": 0.12463609898107715,
974
+ "grad_norm": 0.1484801322221756,
975
+ "learning_rate": 4.884150920623737e-05,
976
+ "loss": 0.20096096992492676,
977
+ "step": 685
978
+ },
979
+ {
980
+ "epoch": 0.12554585152838427,
981
+ "grad_norm": 0.1455296128988266,
982
+ "learning_rate": 4.88192583165698e-05,
983
+ "loss": 0.20518505573272705,
984
+ "step": 690
985
+ },
986
+ {
987
+ "epoch": 0.12645560407569142,
988
+ "grad_norm": 0.14517490565776825,
989
+ "learning_rate": 4.879680093699598e-05,
990
+ "loss": 0.18859238624572755,
991
+ "step": 695
992
+ },
993
+ {
994
+ "epoch": 0.12736535662299855,
995
+ "grad_norm": 0.18778090178966522,
996
+ "learning_rate": 4.877413726219964e-05,
997
+ "loss": 0.197074818611145,
998
+ "step": 700
999
+ },
1000
+ {
1001
+ "epoch": 0.12827510917030568,
1002
+ "grad_norm": 0.13497677445411682,
1003
+ "learning_rate": 4.87512674886529e-05,
1004
+ "loss": 0.18713107109069824,
1005
+ "step": 705
1006
+ },
1007
+ {
1008
+ "epoch": 0.12918486171761281,
1009
+ "grad_norm": 0.12657155096530914,
1010
+ "learning_rate": 4.872819181461455e-05,
1011
+ "loss": 0.1858484387397766,
1012
+ "step": 710
1013
+ },
1014
+ {
1015
+ "epoch": 0.13009461426491994,
1016
+ "grad_norm": 0.11458148807287216,
1017
+ "learning_rate": 4.870491044012834e-05,
1018
+ "loss": 0.18732179403305055,
1019
+ "step": 715
1020
+ },
1021
+ {
1022
+ "epoch": 0.13100436681222707,
1023
+ "grad_norm": 0.13000249862670898,
1024
+ "learning_rate": 4.8681423567021244e-05,
1025
+ "loss": 0.1872936010360718,
1026
+ "step": 720
1027
+ },
1028
+ {
1029
+ "epoch": 0.1319141193595342,
1030
+ "grad_norm": 0.14580890536308289,
1031
+ "learning_rate": 4.865773139890172e-05,
1032
+ "loss": 0.19280019998550416,
1033
+ "step": 725
1034
+ },
1035
+ {
1036
+ "epoch": 0.13282387190684133,
1037
+ "grad_norm": 0.1507277935743332,
1038
+ "learning_rate": 4.8633834141157913e-05,
1039
+ "loss": 0.1898929238319397,
1040
+ "step": 730
1041
+ },
1042
+ {
1043
+ "epoch": 0.13373362445414846,
1044
+ "grad_norm": 0.1418737769126892,
1045
+ "learning_rate": 4.860973200095592e-05,
1046
+ "loss": 0.17926375865936278,
1047
+ "step": 735
1048
+ },
1049
+ {
1050
+ "epoch": 0.1346433770014556,
1051
+ "grad_norm": 0.17151866853237152,
1052
+ "learning_rate": 4.858542518723794e-05,
1053
+ "loss": 0.18963592052459716,
1054
+ "step": 740
1055
+ },
1056
+ {
1057
+ "epoch": 0.13555312954876272,
1058
+ "grad_norm": 0.11162743717432022,
1059
+ "learning_rate": 4.8560913910720535e-05,
1060
+ "loss": 0.19466646909713745,
1061
+ "step": 745
1062
+ },
1063
+ {
1064
+ "epoch": 0.13646288209606988,
1065
+ "grad_norm": 0.15628376603126526,
1066
+ "learning_rate": 4.8536198383892725e-05,
1067
+ "loss": 0.19494034051895143,
1068
+ "step": 750
1069
+ },
1070
+ {
1071
+ "epoch": 0.137372634643377,
1072
+ "grad_norm": 0.18209289014339447,
1073
+ "learning_rate": 4.851127882101421e-05,
1074
+ "loss": 0.18747550249099731,
1075
+ "step": 755
1076
+ },
1077
+ {
1078
+ "epoch": 0.13828238719068414,
1079
+ "grad_norm": 0.14559614658355713,
1080
+ "learning_rate": 4.8486155438113454e-05,
1081
+ "loss": 0.1897158980369568,
1082
+ "step": 760
1083
+ },
1084
+ {
1085
+ "epoch": 0.13919213973799127,
1086
+ "grad_norm": 0.3198587894439697,
1087
+ "learning_rate": 4.846082845298586e-05,
1088
+ "loss": 0.18571001291275024,
1089
+ "step": 765
1090
+ },
1091
+ {
1092
+ "epoch": 0.1401018922852984,
1093
+ "grad_norm": 0.1486678421497345,
1094
+ "learning_rate": 4.843529808519189e-05,
1095
+ "loss": 0.19561930894851684,
1096
+ "step": 770
1097
+ },
1098
+ {
1099
+ "epoch": 0.14101164483260553,
1100
+ "grad_norm": 0.15318170189857483,
1101
+ "learning_rate": 4.840956455605509e-05,
1102
+ "loss": 0.187040114402771,
1103
+ "step": 775
1104
+ },
1105
+ {
1106
+ "epoch": 0.14192139737991266,
1107
+ "grad_norm": 0.13754244148731232,
1108
+ "learning_rate": 4.838362808866025e-05,
1109
+ "loss": 0.18345539569854735,
1110
+ "step": 780
1111
+ },
1112
+ {
1113
+ "epoch": 0.1428311499272198,
1114
+ "grad_norm": 0.12943248450756073,
1115
+ "learning_rate": 4.835748890785143e-05,
1116
+ "loss": 0.1921079397201538,
1117
+ "step": 785
1118
+ },
1119
+ {
1120
+ "epoch": 0.14374090247452692,
1121
+ "grad_norm": 0.110458143055439,
1122
+ "learning_rate": 4.833114724023001e-05,
1123
+ "loss": 0.17927205562591553,
1124
+ "step": 790
1125
+ },
1126
+ {
1127
+ "epoch": 0.14465065502183405,
1128
+ "grad_norm": 0.2421770840883255,
1129
+ "learning_rate": 4.830460331415275e-05,
1130
+ "loss": 0.18317567110061644,
1131
+ "step": 795
1132
+ },
1133
+ {
1134
+ "epoch": 0.14556040756914118,
1135
+ "grad_norm": 0.14752762019634247,
1136
+ "learning_rate": 4.8277857359729787e-05,
1137
+ "loss": 0.1843916058540344,
1138
+ "step": 800
1139
+ },
1140
+ {
1141
+ "epoch": 0.14647016011644834,
1142
+ "grad_norm": 0.15043556690216064,
1143
+ "learning_rate": 4.8250909608822644e-05,
1144
+ "loss": 0.18354393243789674,
1145
+ "step": 805
1146
+ },
1147
+ {
1148
+ "epoch": 0.14737991266375547,
1149
+ "grad_norm": 0.1381794661283493,
1150
+ "learning_rate": 4.822376029504223e-05,
1151
+ "loss": 0.1789781332015991,
1152
+ "step": 810
1153
+ },
1154
+ {
1155
+ "epoch": 0.1482896652110626,
1156
+ "grad_norm": 0.18386174738407135,
1157
+ "learning_rate": 4.819640965374681e-05,
1158
+ "loss": 0.19494292736053467,
1159
+ "step": 815
1160
+ },
1161
+ {
1162
+ "epoch": 0.14919941775836973,
1163
+ "grad_norm": 0.13829593360424042,
1164
+ "learning_rate": 4.816885792203996e-05,
1165
+ "loss": 0.18486063480377196,
1166
+ "step": 820
1167
+ },
1168
+ {
1169
+ "epoch": 0.15010917030567686,
1170
+ "grad_norm": 0.15033291280269623,
1171
+ "learning_rate": 4.814110533876852e-05,
1172
+ "loss": 0.18061509132385253,
1173
+ "step": 825
1174
+ },
1175
+ {
1176
+ "epoch": 0.151018922852984,
1177
+ "grad_norm": 0.17150473594665527,
1178
+ "learning_rate": 4.811315214452051e-05,
1179
+ "loss": 0.18464866876602173,
1180
+ "step": 830
1181
+ },
1182
+ {
1183
+ "epoch": 0.15192867540029112,
1184
+ "grad_norm": 0.15317125618457794,
1185
+ "learning_rate": 4.808499858162307e-05,
1186
+ "loss": 0.1837708592414856,
1187
+ "step": 835
1188
+ },
1189
+ {
1190
+ "epoch": 0.15283842794759825,
1191
+ "grad_norm": 0.2671392560005188,
1192
+ "learning_rate": 4.805664489414031e-05,
1193
+ "loss": 0.19338636398315429,
1194
+ "step": 840
1195
+ },
1196
+ {
1197
+ "epoch": 0.15374818049490538,
1198
+ "grad_norm": 0.14047028124332428,
1199
+ "learning_rate": 4.802809132787125e-05,
1200
+ "loss": 0.17069108486175538,
1201
+ "step": 845
1202
+ },
1203
+ {
1204
+ "epoch": 0.1546579330422125,
1205
+ "grad_norm": 0.1520431935787201,
1206
+ "learning_rate": 4.799933813034768e-05,
1207
+ "loss": 0.18607735633850098,
1208
+ "step": 850
1209
+ },
1210
+ {
1211
+ "epoch": 0.15556768558951964,
1212
+ "grad_norm": 0.17239463329315186,
1213
+ "learning_rate": 4.797038555083197e-05,
1214
+ "loss": 0.18069062232971192,
1215
+ "step": 855
1216
+ },
1217
+ {
1218
+ "epoch": 0.1564774381368268,
1219
+ "grad_norm": 0.1377955675125122,
1220
+ "learning_rate": 4.794123384031495e-05,
1221
+ "loss": 0.18870222568511963,
1222
+ "step": 860
1223
+ },
1224
+ {
1225
+ "epoch": 0.15738719068413393,
1226
+ "grad_norm": 0.15901461243629456,
1227
+ "learning_rate": 4.791188325151373e-05,
1228
+ "loss": 0.18128334283828734,
1229
+ "step": 865
1230
+ },
1231
+ {
1232
+ "epoch": 0.15829694323144106,
1233
+ "grad_norm": 0.14634132385253906,
1234
+ "learning_rate": 4.7882334038869495e-05,
1235
+ "loss": 0.1866163969039917,
1236
+ "step": 870
1237
+ },
1238
+ {
1239
+ "epoch": 0.1592066957787482,
1240
+ "grad_norm": 0.15361061692237854,
1241
+ "learning_rate": 4.785258645854529e-05,
1242
+ "loss": 0.17850807905197144,
1243
+ "step": 875
1244
+ },
1245
+ {
1246
+ "epoch": 0.16011644832605532,
1247
+ "grad_norm": 0.13751649856567383,
1248
+ "learning_rate": 4.782264076842385e-05,
1249
+ "loss": 0.17731113433837892,
1250
+ "step": 880
1251
+ },
1252
+ {
1253
+ "epoch": 0.16102620087336245,
1254
+ "grad_norm": 0.17909638583660126,
1255
+ "learning_rate": 4.7792497228105314e-05,
1256
+ "loss": 0.18344542980194092,
1257
+ "step": 885
1258
+ },
1259
+ {
1260
+ "epoch": 0.16193595342066958,
1261
+ "grad_norm": 0.16038304567337036,
1262
+ "learning_rate": 4.776215609890498e-05,
1263
+ "loss": 0.18868647813796996,
1264
+ "step": 890
1265
+ },
1266
+ {
1267
+ "epoch": 0.1628457059679767,
1268
+ "grad_norm": 0.1653951108455658,
1269
+ "learning_rate": 4.773161764385107e-05,
1270
+ "loss": 0.18614152669906617,
1271
+ "step": 895
1272
+ },
1273
+ {
1274
+ "epoch": 0.16375545851528384,
1275
+ "grad_norm": 0.16193026304244995,
1276
+ "learning_rate": 4.770088212768241e-05,
1277
+ "loss": 0.18564575910568237,
1278
+ "step": 900
1279
+ },
1280
+ {
1281
+ "epoch": 0.16466521106259097,
1282
+ "grad_norm": 0.16048531234264374,
1283
+ "learning_rate": 4.7669949816846173e-05,
1284
+ "loss": 0.18330031633377075,
1285
+ "step": 905
1286
+ },
1287
+ {
1288
+ "epoch": 0.1655749636098981,
1289
+ "grad_norm": 0.1440177708864212,
1290
+ "learning_rate": 4.7638820979495534e-05,
1291
+ "loss": 0.17712442874908446,
1292
+ "step": 910
1293
+ },
1294
+ {
1295
+ "epoch": 0.16648471615720525,
1296
+ "grad_norm": 0.19635969400405884,
1297
+ "learning_rate": 4.760749588548738e-05,
1298
+ "loss": 0.18679027557373046,
1299
+ "step": 915
1300
+ },
1301
+ {
1302
+ "epoch": 0.16739446870451238,
1303
+ "grad_norm": 0.15576541423797607,
1304
+ "learning_rate": 4.757597480637995e-05,
1305
+ "loss": 0.19283764362335204,
1306
+ "step": 920
1307
+ },
1308
+ {
1309
+ "epoch": 0.1683042212518195,
1310
+ "grad_norm": 0.1550331562757492,
1311
+ "learning_rate": 4.7544258015430463e-05,
1312
+ "loss": 0.18269542455673218,
1313
+ "step": 925
1314
+ },
1315
+ {
1316
+ "epoch": 0.16921397379912664,
1317
+ "grad_norm": 0.18369626998901367,
1318
+ "learning_rate": 4.75123457875928e-05,
1319
+ "loss": 0.1697891116142273,
1320
+ "step": 930
1321
+ },
1322
+ {
1323
+ "epoch": 0.17012372634643377,
1324
+ "grad_norm": 0.15266314148902893,
1325
+ "learning_rate": 4.7480238399515074e-05,
1326
+ "loss": 0.18523451089859008,
1327
+ "step": 935
1328
+ },
1329
+ {
1330
+ "epoch": 0.1710334788937409,
1331
+ "grad_norm": 0.16709664463996887,
1332
+ "learning_rate": 4.744793612953724e-05,
1333
+ "loss": 0.1803238034248352,
1334
+ "step": 940
1335
+ },
1336
+ {
1337
+ "epoch": 0.17194323144104803,
1338
+ "grad_norm": 0.14929179847240448,
1339
+ "learning_rate": 4.741543925768872e-05,
1340
+ "loss": 0.1861217737197876,
1341
+ "step": 945
1342
+ },
1343
+ {
1344
+ "epoch": 0.17285298398835516,
1345
+ "grad_norm": 0.1362280696630478,
1346
+ "learning_rate": 4.7382748065685915e-05,
1347
+ "loss": 0.17896100282669067,
1348
+ "step": 950
1349
+ },
1350
+ {
1351
+ "epoch": 0.1737627365356623,
1352
+ "grad_norm": 0.15290239453315735,
1353
+ "learning_rate": 4.734986283692982e-05,
1354
+ "loss": 0.18432788848876952,
1355
+ "step": 955
1356
+ },
1357
+ {
1358
+ "epoch": 0.17467248908296942,
1359
+ "grad_norm": 0.1287035197019577,
1360
+ "learning_rate": 4.73167838565035e-05,
1361
+ "loss": 0.18485682010650634,
1362
+ "step": 960
1363
+ },
1364
+ {
1365
+ "epoch": 0.17558224163027655,
1366
+ "grad_norm": 0.17969627678394318,
1367
+ "learning_rate": 4.728351141116971e-05,
1368
+ "loss": 0.17361557483673096,
1369
+ "step": 965
1370
+ },
1371
+ {
1372
+ "epoch": 0.1764919941775837,
1373
+ "grad_norm": 0.13751201331615448,
1374
+ "learning_rate": 4.7250045789368326e-05,
1375
+ "loss": 0.1731679320335388,
1376
+ "step": 970
1377
+ },
1378
+ {
1379
+ "epoch": 0.17740174672489084,
1380
+ "grad_norm": 0.1603265255689621,
1381
+ "learning_rate": 4.721638728121388e-05,
1382
+ "loss": 0.17308170795440675,
1383
+ "step": 975
1384
+ },
1385
+ {
1386
+ "epoch": 0.17831149927219797,
1387
+ "grad_norm": 0.1592789888381958,
1388
+ "learning_rate": 4.718253617849306e-05,
1389
+ "loss": 0.17534757852554322,
1390
+ "step": 980
1391
+ },
1392
+ {
1393
+ "epoch": 0.1792212518195051,
1394
+ "grad_norm": 0.12727224826812744,
1395
+ "learning_rate": 4.714849277466214e-05,
1396
+ "loss": 0.17817609310150145,
1397
+ "step": 985
1398
+ },
1399
+ {
1400
+ "epoch": 0.18013100436681223,
1401
+ "grad_norm": 0.15401554107666016,
1402
+ "learning_rate": 4.711425736484447e-05,
1403
+ "loss": 0.1733405351638794,
1404
+ "step": 990
1405
+ },
1406
+ {
1407
+ "epoch": 0.18104075691411936,
1408
+ "grad_norm": 0.13253968954086304,
1409
+ "learning_rate": 4.7079830245827906e-05,
1410
+ "loss": 0.17846795320510864,
1411
+ "step": 995
1412
+ },
1413
+ {
1414
+ "epoch": 0.1819505094614265,
1415
+ "grad_norm": 0.21846213936805725,
1416
+ "learning_rate": 4.7045211716062245e-05,
1417
+ "loss": 0.18021599054336548,
1418
+ "step": 1000
1419
+ },
1420
+ {
1421
+ "epoch": 0.18286026200873362,
1422
+ "grad_norm": 0.16867990791797638,
1423
+ "learning_rate": 4.7010402075656595e-05,
1424
+ "loss": 0.18232386112213134,
1425
+ "step": 1005
1426
+ },
1427
+ {
1428
+ "epoch": 0.18377001455604075,
1429
+ "grad_norm": 0.17180582880973816,
1430
+ "learning_rate": 4.697540162637686e-05,
1431
+ "loss": 0.1816317319869995,
1432
+ "step": 1010
1433
+ },
1434
+ {
1435
+ "epoch": 0.18467976710334788,
1436
+ "grad_norm": 0.16480213403701782,
1437
+ "learning_rate": 4.694021067164303e-05,
1438
+ "loss": 0.17718446254730225,
1439
+ "step": 1015
1440
+ },
1441
+ {
1442
+ "epoch": 0.185589519650655,
1443
+ "grad_norm": 0.15015918016433716,
1444
+ "learning_rate": 4.6904829516526605e-05,
1445
+ "loss": 0.17412011623382567,
1446
+ "step": 1020
1447
+ },
1448
+ {
1449
+ "epoch": 0.18649927219796217,
1450
+ "grad_norm": 0.14445139467716217,
1451
+ "learning_rate": 4.686925846774795e-05,
1452
+ "loss": 0.1778018832206726,
1453
+ "step": 1025
1454
+ },
1455
+ {
1456
+ "epoch": 0.1874090247452693,
1457
+ "grad_norm": 0.1701960265636444,
1458
+ "learning_rate": 4.683349783367362e-05,
1459
+ "loss": 0.16901081800460815,
1460
+ "step": 1030
1461
+ },
1462
+ {
1463
+ "epoch": 0.18831877729257643,
1464
+ "grad_norm": 0.15894867479801178,
1465
+ "learning_rate": 4.679754792431368e-05,
1466
+ "loss": 0.17055928707122803,
1467
+ "step": 1035
1468
+ },
1469
+ {
1470
+ "epoch": 0.18922852983988356,
1471
+ "grad_norm": 0.1511942446231842,
1472
+ "learning_rate": 4.676140905131903e-05,
1473
+ "loss": 0.17339680194854737,
1474
+ "step": 1040
1475
+ },
1476
+ {
1477
+ "epoch": 0.1901382823871907,
1478
+ "grad_norm": 0.14735209941864014,
1479
+ "learning_rate": 4.672508152797872e-05,
1480
+ "loss": 0.17802717685699462,
1481
+ "step": 1045
1482
+ },
1483
+ {
1484
+ "epoch": 0.19104803493449782,
1485
+ "grad_norm": 0.17367291450500488,
1486
+ "learning_rate": 4.66885656692172e-05,
1487
+ "loss": 0.1732744097709656,
1488
+ "step": 1050
1489
+ },
1490
+ {
1491
+ "epoch": 0.19195778748180495,
1492
+ "grad_norm": 0.147227481007576,
1493
+ "learning_rate": 4.665186179159159e-05,
1494
+ "loss": 0.17040517330169677,
1495
+ "step": 1055
1496
+ },
1497
+ {
1498
+ "epoch": 0.19286754002911208,
1499
+ "grad_norm": 0.1709655076265335,
1500
+ "learning_rate": 4.6614970213289e-05,
1501
+ "loss": 0.17794088125228882,
1502
+ "step": 1060
1503
+ },
1504
+ {
1505
+ "epoch": 0.1937772925764192,
1506
+ "grad_norm": 0.1588088721036911,
1507
+ "learning_rate": 4.657789125412366e-05,
1508
+ "loss": 0.17180380821228028,
1509
+ "step": 1065
1510
+ },
1511
+ {
1512
+ "epoch": 0.19468704512372634,
1513
+ "grad_norm": 0.14827021956443787,
1514
+ "learning_rate": 4.654062523553428e-05,
1515
+ "loss": 0.182997989654541,
1516
+ "step": 1070
1517
+ },
1518
+ {
1519
+ "epoch": 0.19559679767103347,
1520
+ "grad_norm": 0.16230466961860657,
1521
+ "learning_rate": 4.6503172480581126e-05,
1522
+ "loss": 0.17346880435943604,
1523
+ "step": 1075
1524
+ },
1525
+ {
1526
+ "epoch": 0.1965065502183406,
1527
+ "grad_norm": 0.1637624353170395,
1528
+ "learning_rate": 4.646553331394333e-05,
1529
+ "loss": 0.17263576984405518,
1530
+ "step": 1080
1531
+ },
1532
+ {
1533
+ "epoch": 0.19741630276564776,
1534
+ "grad_norm": 0.15977843105793,
1535
+ "learning_rate": 4.642770806191603e-05,
1536
+ "loss": 0.17284308671951293,
1537
+ "step": 1085
1538
+ },
1539
+ {
1540
+ "epoch": 0.19832605531295489,
1541
+ "grad_norm": 0.15394869446754456,
1542
+ "learning_rate": 4.6389697052407534e-05,
1543
+ "loss": 0.17797101736068727,
1544
+ "step": 1090
1545
+ },
1546
+ {
1547
+ "epoch": 0.19923580786026202,
1548
+ "grad_norm": 0.15995225310325623,
1549
+ "learning_rate": 4.6351500614936485e-05,
1550
+ "loss": 0.18137198686599731,
1551
+ "step": 1095
1552
+ },
1553
+ {
1554
+ "epoch": 0.20014556040756915,
1555
+ "grad_norm": 0.1779479682445526,
1556
+ "learning_rate": 4.6313119080629006e-05,
1557
+ "loss": 0.17998344898223878,
1558
+ "step": 1100
1559
+ }
1560
+ ],
1561
+ "logging_steps": 5,
1562
+ "max_steps": 5500,
1563
+ "num_input_tokens_seen": 0,
1564
+ "num_train_epochs": 2,
1565
+ "save_steps": 100,
1566
+ "stateful_callbacks": {
1567
+ "TrainerControl": {
1568
+ "args": {
1569
+ "should_epoch_stop": false,
1570
+ "should_evaluate": false,
1571
+ "should_log": false,
1572
+ "should_save": true,
1573
+ "should_training_stop": false
1574
+ },
1575
+ "attributes": {}
1576
+ }
1577
+ },
1578
+ "total_flos": 6.127484770153037e+17,
1579
+ "train_batch_size": 8,
1580
+ "trial_name": null,
1581
+ "trial_params": null
1582
+ }
checkpoint-1100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9abb54db33ad7a1865253346fbffbda40d7b72587ff8ccf0cc69c9680b59201
3
+ size 5777
checkpoint-1200/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/gemma-4-E4B-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/gemma-4-E4B-it
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.19.1
checkpoint-1200/adapter_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Gemma4ForConditionalGeneration",
7
+ "parent_library": "transformers.models.gemma4.modeling_gemma4",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/gemma-4-E4B-it",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 16,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0.0,
26
+ "lora_ga_config": null,
27
+ "megatron_config": null,
28
+ "megatron_core": "megatron.core",
29
+ "modules_to_save": null,
30
+ "peft_type": "LORA",
31
+ "peft_version": "0.19.1",
32
+ "qalora_group_size": 16,
33
+ "r": 16,
34
+ "rank_pattern": {},
35
+ "revision": null,
36
+ "target_modules": [
37
+ "gate_proj",
38
+ "v_proj",
39
+ "o_proj",
40
+ "k_proj",
41
+ "up_proj",
42
+ "down_proj",
43
+ "q_proj"
44
+ ],
45
+ "target_parameters": null,
46
+ "task_type": "CAUSAL_LM",
47
+ "trainable_token_indices": null,
48
+ "use_bdlora": null,
49
+ "use_dora": false,
50
+ "use_qalora": false,
51
+ "use_rslora": false
52
+ }
checkpoint-1200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:758b7e5c64f7b3b9a2dfb7f9c3f402266b67013f70427ae941acb07350f0c694
3
+ size 169741912
checkpoint-1200/chat_template.jinja ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required, filter_keys=false) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- set add_comma = false -%}
6
+ {%- if not filter_keys or key not in standard_keys -%}
7
+ {%- if ns.found_first %},{% endif -%}
8
+ {%- set ns.found_first = true -%}
9
+ {{ key }}:{
10
+ {%- if value['description'] -%}
11
+ description:<|"|>{{ value['description'] }}<|"|>
12
+ {%- set add_comma = true -%}
13
+ {%- endif -%}
14
+ {%- if value['type'] | upper == 'STRING' -%}
15
+ {%- if value['enum'] -%}
16
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
17
+ enum:{{ format_argument(value['enum']) }}
18
+ {%- endif -%}
19
+ {%- elif value['type'] | upper == 'ARRAY' -%}
20
+ {%- if value['items'] is mapping and value['items'] -%}
21
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
22
+ items:{
23
+ {%- set ns_items = namespace(found_first=false) -%}
24
+ {%- for item_key, item_value in value['items'] | dictsort -%}
25
+ {%- if item_value is not none -%}
26
+ {%- if ns_items.found_first %},{% endif -%}
27
+ {%- set ns_items.found_first = true -%}
28
+ {%- if item_key == 'properties' -%}
29
+ properties:{
30
+ {%- if item_value is mapping -%}
31
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
32
+ {%- endif -%}
33
+ }
34
+ {%- elif item_key == 'required' -%}
35
+ required:[
36
+ {%- for req_item in item_value -%}
37
+ <|"|>{{- req_item -}}<|"|>
38
+ {%- if not loop.last %},{% endif -%}
39
+ {%- endfor -%}
40
+ ]
41
+ {%- elif item_key == 'type' -%}
42
+ {%- if item_value is string -%}
43
+ type:{{ format_argument(item_value | upper) }}
44
+ {%- else -%}
45
+ type:{{ format_argument(item_value | map('upper') | list) }}
46
+ {%- endif -%}
47
+ {%- else -%}
48
+ {{ item_key }}:{{ format_argument(item_value) }}
49
+ {%- endif -%}
50
+ {%- endif -%}
51
+ {%- endfor -%}
52
+ }
53
+ {%- endif -%}
54
+ {%- endif -%}
55
+ {%- if value['nullable'] %}
56
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
57
+ nullable:true
58
+ {%- endif -%}
59
+ {%- if value['type'] | upper == 'OBJECT' -%}
60
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
61
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
62
+ properties:{
63
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
64
+ }
65
+ {%- elif value is mapping -%}
66
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
67
+ properties:{
68
+ {{- format_parameters(value, value['required'] | default([]), filter_keys=true) -}}
69
+ }
70
+ {%- endif -%}
71
+ {%- if value['required'] -%}
72
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
73
+ required:[
74
+ {%- for item in value['required'] | default([]) -%}
75
+ <|"|>{{- item -}}<|"|>
76
+ {%- if not loop.last %},{% endif -%}
77
+ {%- endfor -%}
78
+ ]
79
+ {%- endif -%}
80
+ {%- endif -%}
81
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
82
+ type:<|"|>{{ value['type'] | upper }}<|"|>}
83
+ {%- endif -%}
84
+ {%- endfor -%}
85
+ {%- endmacro -%}
86
+ {%- macro format_function_declaration(tool_data) -%}
87
+ declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
88
+ {%- set params = tool_data['function']['parameters'] -%}
89
+ {%- if params -%}
90
+ ,parameters:{
91
+ {%- if params['properties'] -%}
92
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
93
+ {%- endif -%}
94
+ {%- if params['required'] -%}
95
+ required:[
96
+ {%- for item in params['required'] -%}
97
+ <|"|>{{- item -}}<|"|>
98
+ {{- ',' if not loop.last -}}
99
+ {%- endfor -%}
100
+ ],
101
+ {%- endif -%}
102
+ {%- if params['type'] -%}
103
+ type:<|"|>{{- params['type'] | upper -}}<|"|>}
104
+ {%- endif -%}
105
+ {%- endif -%}
106
+ {%- if 'response' in tool_data['function'] -%}
107
+ {%- set response_declaration = tool_data['function']['response'] -%}
108
+ ,response:{
109
+ {%- if response_declaration['description'] -%}
110
+ description:<|"|>{{- response_declaration['description'] -}}<|"|>,
111
+ {%- endif -%}
112
+ {%- if response_declaration['type'] | upper == 'OBJECT' -%}
113
+ type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
114
+ {%- endif -%}
115
+ {%- endif -%}
116
+ }
117
+ {%- endmacro -%}
118
+ {%- macro format_argument(argument, escape_keys=True) -%}
119
+ {%- if argument is string -%}
120
+ {{- '<|"|>' + argument + '<|"|>' -}}
121
+ {%- elif argument is boolean -%}
122
+ {{- 'true' if argument else 'false' -}}
123
+ {%- elif argument is mapping -%}
124
+ {{- '{' -}}
125
+ {%- set ns = namespace(found_first=false) -%}
126
+ {%- for key, value in argument | dictsort -%}
127
+ {%- if ns.found_first %},{% endif -%}
128
+ {%- set ns.found_first = true -%}
129
+ {%- if escape_keys -%}
130
+ {{- '<|"|>' + key + '<|"|>' -}}
131
+ {%- else -%}
132
+ {{- key -}}
133
+ {%- endif -%}
134
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
135
+ {%- endfor -%}
136
+ {{- '}' -}}
137
+ {%- elif argument is sequence -%}
138
+ {{- '[' -}}
139
+ {%- for item in argument -%}
140
+ {{- format_argument(item, escape_keys=escape_keys) -}}
141
+ {%- if not loop.last %},{% endif -%}
142
+ {%- endfor -%}
143
+ {{- ']' -}}
144
+ {%- else -%}
145
+ {{- argument -}}
146
+ {%- endif -%}
147
+ {%- endmacro -%}
148
+ {%- macro strip_thinking(text) -%}
149
+ {%- set ns = namespace(result='') -%}
150
+ {%- for part in text.split('<channel|>') -%}
151
+ {%- if '<|channel>' in part -%}
152
+ {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
153
+ {%- else -%}
154
+ {%- set ns.result = ns.result + part -%}
155
+ {%- endif -%}
156
+ {%- endfor -%}
157
+ {{- ns.result | trim -}}
158
+ {%- endmacro -%}
159
+
160
+ {%- macro format_tool_response_block(tool_name, response) -%}
161
+ {{- '<|tool_response>' -}}
162
+ {%- if response is mapping -%}
163
+ {{- 'response:' + tool_name + '{' -}}
164
+ {%- for key, value in response | dictsort -%}
165
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
166
+ {%- if not loop.last %},{% endif -%}
167
+ {%- endfor -%}
168
+ {{- '}' -}}
169
+ {%- else -%}
170
+ {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
171
+ {%- endif -%}
172
+ {{- '<tool_response|>' -}}
173
+ {%- endmacro -%}
174
+
175
+ {%- set ns = namespace(prev_message_type=None) -%}
176
+ {%- set loop_messages = messages -%}
177
+ {{- bos_token -}}
178
+ {#- Handle System/Tool Definitions Block -#}
179
+ {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
180
+ {{- '<|turn>system\n' -}}
181
+ {#- Inject Thinking token at the very top of the FIRST system turn -#}
182
+ {%- if enable_thinking is defined and enable_thinking -%}
183
+ {{- '<|think|>\n' -}}
184
+ {%- set ns.prev_message_type = 'think' -%}
185
+ {%- endif -%}
186
+ {%- if messages[0]['role'] in ['system', 'developer'] -%}
187
+ {%- if messages[0]['content'] is string -%}
188
+ {{- messages[0]['content'] | trim -}}
189
+ {%- elif messages[0]['content'] is sequence -%}
190
+ {%- for item in messages[0]['content'] -%}
191
+ {{- item['text'] | trim + ' '-}}
192
+ {%- endfor -%}
193
+ {%- endif -%}
194
+ {%- set loop_messages = messages[1:] -%}
195
+ {%- endif -%}
196
+ {%- if tools -%}
197
+ {%- for tool in tools %}
198
+ {{- '<|tool>' -}}
199
+ {{- format_function_declaration(tool) | trim -}}
200
+ {{- '<tool|>' -}}
201
+ {%- endfor %}
202
+ {%- set ns.prev_message_type = 'tool' -%}
203
+ {%- endif -%}
204
+ {{- '<turn|>\n' -}}
205
+ {%- endif %}
206
+
207
+ {#- Pre-scan: find last user message index for reasoning guard -#}
208
+ {%- set ns_turn = namespace(last_user_idx=-1) -%}
209
+ {%- for i in range(loop_messages | length) -%}
210
+ {%- if loop_messages[i]['role'] == 'user' -%}
211
+ {%- set ns_turn.last_user_idx = i -%}
212
+ {%- endif -%}
213
+ {%- endfor -%}
214
+
215
+ {#- Loop through messages -#}
216
+ {%- for message in loop_messages -%}
217
+ {%- if message['role'] != 'tool' -%}
218
+ {%- set ns.prev_message_type = None -%}
219
+ {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
220
+ {#- Detect continuation: suppress duplicate <|turn>model when previous non-tool message was also assistant -#}
221
+ {%- set prev_nt = namespace(role=None, found=false) -%}
222
+ {%- if loop.index0 > 0 -%}
223
+ {%- for j in range(loop.index0 - 1, -1, -1) -%}
224
+ {%- if not prev_nt.found -%}
225
+ {%- if loop_messages[j]['role'] != 'tool' -%}
226
+ {%- set prev_nt.role = loop_messages[j]['role'] -%}
227
+ {%- set prev_nt.found = true -%}
228
+ {%- endif -%}
229
+ {%- endif -%}
230
+ {%- endfor -%}
231
+ {%- endif -%}
232
+ {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
233
+ {%- if not continue_same_model_turn -%}
234
+ {{- '<|turn>' + role + '\n' }}
235
+ {%- endif -%}
236
+
237
+ {#- Render reasoning/reasoning_content as thinking channel -#}
238
+ {%- set thinking_text = message.get('reasoning') or message.get('reasoning_content') -%}
239
+ {%- if thinking_text and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
240
+ {{- '<|channel>thought\n' + thinking_text + '\n<channel|>' -}}
241
+ {%- endif -%}
242
+
243
+ {%- if message['tool_calls'] -%}
244
+ {%- for tool_call in message['tool_calls'] -%}
245
+ {%- set function = tool_call['function'] -%}
246
+ {{- '<|tool_call>call:' + function['name'] + '{' -}}
247
+ {%- if function['arguments'] is mapping -%}
248
+ {%- set ns_args = namespace(found_first=false) -%}
249
+ {%- for key, value in function['arguments'] | dictsort -%}
250
+ {%- if ns_args.found_first %},{% endif -%}
251
+ {%- set ns_args.found_first = true -%}
252
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
253
+ {%- endfor -%}
254
+ {%- elif function['arguments'] is string -%}
255
+ {{- function['arguments'] -}}
256
+ {%- endif -%}
257
+ {{- '}<tool_call|>' -}}
258
+ {%- endfor -%}
259
+ {%- set ns.prev_message_type = 'tool_call' -%}
260
+ {%- endif -%}
261
+
262
+ {%- set ns_tr_out = namespace(flag=false) -%}
263
+ {%- if message.get('tool_responses') -%}
264
+ {#- Legacy: tool_responses embedded on the assistant message (Google/Gemma native) -#}
265
+ {%- for tool_response in message['tool_responses'] -%}
266
+ {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
267
+ {%- set ns_tr_out.flag = true -%}
268
+ {%- set ns.prev_message_type = 'tool_response' -%}
269
+ {%- endfor -%}
270
+ {%- elif message.get('tool_calls') -%}
271
+ {#- OpenAI Chat Completions: forward-scan consecutive role:tool messages -#}
272
+ {%- set ns_tool_scan = namespace(stopped=false) -%}
273
+ {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
274
+ {%- if ns_tool_scan.stopped -%}
275
+ {%- elif loop_messages[k]['role'] != 'tool' -%}
276
+ {%- set ns_tool_scan.stopped = true -%}
277
+ {%- else -%}
278
+ {%- set follow = loop_messages[k] -%}
279
+ {#- Resolve tool_call_id to function name -#}
280
+ {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
281
+ {%- for tc in message['tool_calls'] -%}
282
+ {%- if tc.get('id') == follow.get('tool_call_id') -%}
283
+ {%- set ns_tname.name = tc['function']['name'] -%}
284
+ {%- endif -%}
285
+ {%- endfor -%}
286
+ {#- Handle content as string or content-parts array -#}
287
+ {%- set tool_body = follow.get('content') -%}
288
+ {%- if tool_body is string -%}
289
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
290
+ {%- elif tool_body is sequence and tool_body is not string -%}
291
+ {%- set ns_txt = namespace(s='') -%}
292
+ {%- for part in tool_body -%}
293
+ {%- if part.get('type') == 'text' -%}
294
+ {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
295
+ {%- endif -%}
296
+ {%- endfor -%}
297
+ {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
298
+ {%- else -%}
299
+ {{- format_tool_response_block(ns_tname.name, tool_body) -}}
300
+ {%- endif -%}
301
+ {%- set ns_tr_out.flag = true -%}
302
+ {%- set ns.prev_message_type = 'tool_response' -%}
303
+ {%- endif -%}
304
+ {%- endfor -%}
305
+ {%- endif -%}
306
+
307
+ {%- set captured_content -%}
308
+ {%- if message['content'] is string -%}
309
+ {%- if role == 'model' -%}
310
+ {{- strip_thinking(message['content']) -}}
311
+ {%- else -%}
312
+ {{- message['content'] | trim -}}
313
+ {%- endif -%}
314
+ {%- elif message['content'] is sequence -%}
315
+ {%- for item in message['content'] -%}
316
+ {%- if item['type'] == 'text' -%}
317
+ {%- if role == 'model' -%}
318
+ {{- strip_thinking(item['text']) -}}
319
+ {%- else -%}
320
+ {{- item['text'] | trim -}}
321
+ {%- endif -%}
322
+ {%- elif item['type'] == 'image' -%}
323
+ {{- '<|image|>' -}}
324
+ {%- set ns.prev_message_type = 'image' -%}
325
+ {%- elif item['type'] == 'audio' -%}
326
+ {{- '<|audio|>' -}}
327
+ {%- set ns.prev_message_type = 'audio' -%}
328
+ {%- elif item['type'] == 'video' -%}
329
+ {{- '<|video|>' -}}
330
+ {%- set ns.prev_message_type = 'video' -%}
331
+ {%- endif -%}
332
+ {%- endfor -%}
333
+ {%- endif -%}
334
+ {%- endset -%}
335
+
336
+ {{- captured_content -}}
337
+ {%- set has_content = captured_content | trim | length > 0 -%}
338
+
339
+ {%- if ns.prev_message_type == 'tool_call' and not ns_tr_out.flag -%}
340
+ {{- '<|tool_response>' -}}
341
+ {%- elif not (ns_tr_out.flag and not has_content) -%}
342
+ {{- '<turn|>\n' -}}
343
+ {%- endif -%}
344
+ {%- endif -%}
345
+ {%- endfor -%}
346
+
347
+ {%- if add_generation_prompt -%}
348
+ {%- if ns.prev_message_type != 'tool_response' and ns.prev_message_type != 'tool_call' -%}
349
+ {{- '<|turn>model\n' -}}
350
+ {%- endif -%}
351
+ {%- endif -%}
checkpoint-1200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e1e647229ebd58f619f9224174e6d5fab90526935a57bf68b5a5fbc119fb909
3
+ size 72807355
checkpoint-1200/processor_config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_ms_per_token": 40,
3
+ "audio_seq_length": 750,
4
+ "feature_extractor": {
5
+ "dither": 0.0,
6
+ "feature_extractor_type": "Gemma4AudioFeatureExtractor",
7
+ "feature_size": 128,
8
+ "fft_length": 512,
9
+ "fft_overdrive": false,
10
+ "frame_length": 320,
11
+ "hop_length": 160,
12
+ "input_scale_factor": 1.0,
13
+ "max_frequency": 8000.0,
14
+ "mel_floor": 0.001,
15
+ "min_frequency": 0.0,
16
+ "padding_side": "left",
17
+ "padding_value": 0.0,
18
+ "per_bin_mean": null,
19
+ "per_bin_stddev": null,
20
+ "preemphasis": 0.0,
21
+ "preemphasis_htk_flavor": true,
22
+ "return_attention_mask": true,
23
+ "sampling_rate": 16000
24
+ },
25
+ "image_processor": {
26
+ "do_convert_rgb": true,
27
+ "do_normalize": false,
28
+ "do_rescale": true,
29
+ "do_resize": true,
30
+ "image_mean": [
31
+ 0.0,
32
+ 0.0,
33
+ 0.0
34
+ ],
35
+ "image_processor_type": "Gemma4ImageProcessor",
36
+ "image_seq_length": 280,
37
+ "image_std": [
38
+ 1.0,
39
+ 1.0,
40
+ 1.0
41
+ ],
42
+ "max_soft_tokens": 280,
43
+ "patch_size": 16,
44
+ "pooling_kernel_size": 3,
45
+ "resample": 3,
46
+ "rescale_factor": 0.00392156862745098
47
+ },
48
+ "image_seq_length": 280,
49
+ "processor_class": "Gemma4Processor",
50
+ "video_processor": {
51
+ "do_convert_rgb": true,
52
+ "do_normalize": true,
53
+ "do_rescale": true,
54
+ "do_resize": true,
55
+ "do_sample_frames": true,
56
+ "image_mean": [
57
+ 0.0,
58
+ 0.0,
59
+ 0.0
60
+ ],
61
+ "image_std": [
62
+ 1.0,
63
+ 1.0,
64
+ 1.0
65
+ ],
66
+ "max_soft_tokens": 70,
67
+ "num_frames": 32,
68
+ "patch_size": 16,
69
+ "pooling_kernel_size": 3,
70
+ "resample": 3,
71
+ "rescale_factor": 0.00392156862745098,
72
+ "return_metadata": false,
73
+ "video_processor_type": "Gemma4VideoProcessor"
74
+ }
75
+ }
checkpoint-1200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
+ size 14645
checkpoint-1200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efcf962131305188aae5d8c42fb21f39c330e15fc73bc76b4411e357b0d01cee
3
+ size 1465
checkpoint-1200/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
3
+ size 32169626