alexgrigoras committed on
Commit
e623ba8
·
verified ·
1 Parent(s): 95cb259

Upload SDG checkpoint

Browse files
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ pipeline_tag: time-series-forecasting
4
+ tags:
5
+ - time-series
6
+ - synthetic-data
7
+ - seq2seq
8
+ - retail
9
+ - qlora
10
+ base_model: amazon/chronos-t5-small
11
+ ---
12
+
13
+ # alexgrigoras/sdg_chronos_t5_small_dunnhumby
14
+
15
+ Synthetic time-series generation checkpoint for the DIF-PI framework.
16
+
17
+ ## Model summary
18
+
19
+ This checkpoint was trained as a seq2seq generator on tokenized retail demand windows. It uses a T5-style encoder-decoder backbone fine-tuned with QLoRA when available (the training backend recorded for this checkpoint is `lora`; see "Training setup"), extended time-series special tokens, calendar conditioning, multiple-sample generation, and a seasonality-aware calibration step at inference time.
20
+
21
+ ## Intended use
22
+
23
+ The model is intended for research on synthetic retail demand generation and validation inside the DIF-PI framework. It is not intended for safety-critical or fully autonomous business decisions without human review.
24
+
25
+ ## Training setup
26
+
27
+ - Base model: amazon/chronos-t5-small
28
+ - Context length: 140
29
+ - Prediction length: 30
30
+ - Quantization bins: 4094
31
+ - Backend: lora
adapter_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "amazon/chronos-t5-small",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "wo",
33
+ "q",
34
+ "v",
35
+ "o",
36
+ "k",
37
+ "wi"
38
+ ],
39
+ "target_parameters": null,
40
+ "task_type": "SEQ_2_SEQ_LM",
41
+ "trainable_token_indices": null,
42
+ "use_dora": false,
43
+ "use_qalora": false,
44
+ "use_rslora": false
45
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74ad9c4192071e1828c2569d41578cc93e5e765303d67fb476d048e7ca38e371
3
+ size 34675328
checkpoint-1500/README.md ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: amazon/chronos-t5-small
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:amazon/chronos-t5-small
6
+ - lora
7
+ - transformers
8
+ ---
9
+
10
+ # Model Card for Model ID
11
+
12
+ <!-- Provide a quick summary of what the model is/does. -->
13
+
14
+
15
+
16
+ ## Model Details
17
+
18
+ ### Model Description
19
+
20
+ <!-- Provide a longer summary of what this model is. -->
21
+
22
+
23
+
24
+ - **Developed by:** [More Information Needed]
25
+ - **Funded by [optional]:** [More Information Needed]
26
+ - **Shared by [optional]:** [More Information Needed]
27
+ - **Model type:** [More Information Needed]
28
+ - **Language(s) (NLP):** [More Information Needed]
29
+ - **License:** [More Information Needed]
30
+ - **Finetuned from model [optional]:** [More Information Needed]
31
+
32
+ ### Model Sources [optional]
33
+
34
+ <!-- Provide the basic links for the model. -->
35
+
36
+ - **Repository:** [More Information Needed]
37
+ - **Paper [optional]:** [More Information Needed]
38
+ - **Demo [optional]:** [More Information Needed]
39
+
40
+ ## Uses
41
+
42
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
43
+
44
+ ### Direct Use
45
+
46
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
47
+
48
+ [More Information Needed]
49
+
50
+ ### Downstream Use [optional]
51
+
52
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
53
+
54
+ [More Information Needed]
55
+
56
+ ### Out-of-Scope Use
57
+
58
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
59
+
60
+ [More Information Needed]
61
+
62
+ ## Bias, Risks, and Limitations
63
+
64
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
65
+
66
+ [More Information Needed]
67
+
68
+ ### Recommendations
69
+
70
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
71
+
72
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
73
+
74
+ ## How to Get Started with the Model
75
+
76
+ Use the code below to get started with the model.
77
+
78
+ [More Information Needed]
79
+
80
+ ## Training Details
81
+
82
+ ### Training Data
83
+
84
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
85
+
86
+ [More Information Needed]
87
+
88
+ ### Training Procedure
89
+
90
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
91
+
92
+ #### Preprocessing [optional]
93
+
94
+ [More Information Needed]
95
+
96
+
97
+ #### Training Hyperparameters
98
+
99
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
100
+
101
+ #### Speeds, Sizes, Times [optional]
102
+
103
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
104
+
105
+ [More Information Needed]
106
+
107
+ ## Evaluation
108
+
109
+ <!-- This section describes the evaluation protocols and provides the results. -->
110
+
111
+ ### Testing Data, Factors & Metrics
112
+
113
+ #### Testing Data
114
+
115
+ <!-- This should link to a Dataset Card if possible. -->
116
+
117
+ [More Information Needed]
118
+
119
+ #### Factors
120
+
121
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
122
+
123
+ [More Information Needed]
124
+
125
+ #### Metrics
126
+
127
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
128
+
129
+ [More Information Needed]
130
+
131
+ ### Results
132
+
133
+ [More Information Needed]
134
+
135
+ #### Summary
136
+
137
+
138
+
139
+ ## Model Examination [optional]
140
+
141
+ <!-- Relevant interpretability work for the model goes here -->
142
+
143
+ [More Information Needed]
144
+
145
+ ## Environmental Impact
146
+
147
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
148
+
149
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
150
+
151
+ - **Hardware Type:** [More Information Needed]
152
+ - **Hours used:** [More Information Needed]
153
+ - **Cloud Provider:** [More Information Needed]
154
+ - **Compute Region:** [More Information Needed]
155
+ - **Carbon Emitted:** [More Information Needed]
156
+
157
+ ## Technical Specifications [optional]
158
+
159
+ ### Model Architecture and Objective
160
+
161
+ [More Information Needed]
162
+
163
+ ### Compute Infrastructure
164
+
165
+ [More Information Needed]
166
+
167
+ #### Hardware
168
+
169
+ [More Information Needed]
170
+
171
+ #### Software
172
+
173
+ [More Information Needed]
174
+
175
+ ## Citation [optional]
176
+
177
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
178
+
179
+ **BibTeX:**
180
+
181
+ [More Information Needed]
182
+
183
+ **APA:**
184
+
185
+ [More Information Needed]
186
+
187
+ ## Glossary [optional]
188
+
189
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
190
+
191
+ [More Information Needed]
192
+
193
+ ## More Information [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Authors [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Contact
202
+
203
+ [More Information Needed]
204
+ ### Framework versions
205
+
206
+ - PEFT 0.18.1
checkpoint-1500/adapter_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "amazon/chronos-t5-small",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "wo",
33
+ "q",
34
+ "v",
35
+ "o",
36
+ "k",
37
+ "wi"
38
+ ],
39
+ "target_parameters": null,
40
+ "task_type": "SEQ_2_SEQ_LM",
41
+ "trainable_token_indices": null,
42
+ "use_dora": false,
43
+ "use_qalora": false,
44
+ "use_rslora": false
45
+ }
checkpoint-1500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b78c42f7f979d351414ccfed5d2637fda4acc4db8ed9f5a0f72c716808a697d9
3
+ size 34675328
checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fb65a8c4fa0ab45b7980247fc3f91382b6dee186ff9b44d5afd40cc18176eb8
3
+ size 34759371
checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1353004704dfc9675e2e77ea9719cf819d6ab9a0e8ee90a120642c6b15504576
3
+ size 14391
checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b666be922ac85995493c1149e03e49db80a97ac1fe5383402fa9756f48bfd3f8
3
+ size 1465
checkpoint-1500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "eos_token": "</s>",
4
+ "extra_ids": 100,
5
+ "extra_special_tokens": [
6
+ "<extra_id_0>",
7
+ "<extra_id_1>",
8
+ "<extra_id_2>",
9
+ "<extra_id_3>",
10
+ "<extra_id_4>",
11
+ "<extra_id_5>",
12
+ "<extra_id_6>",
13
+ "<extra_id_7>",
14
+ "<extra_id_8>",
15
+ "<extra_id_9>",
16
+ "<extra_id_10>",
17
+ "<extra_id_11>",
18
+ "<extra_id_12>",
19
+ "<extra_id_13>",
20
+ "<extra_id_14>",
21
+ "<extra_id_15>",
22
+ "<extra_id_16>",
23
+ "<extra_id_17>",
24
+ "<extra_id_18>",
25
+ "<extra_id_19>",
26
+ "<extra_id_20>",
27
+ "<extra_id_21>",
28
+ "<extra_id_22>",
29
+ "<extra_id_23>",
30
+ "<extra_id_24>",
31
+ "<extra_id_25>",
32
+ "<extra_id_26>",
33
+ "<extra_id_27>",
34
+ "<extra_id_28>",
35
+ "<extra_id_29>",
36
+ "<extra_id_30>",
37
+ "<extra_id_31>",
38
+ "<extra_id_32>",
39
+ "<extra_id_33>",
40
+ "<extra_id_34>",
41
+ "<extra_id_35>",
42
+ "<extra_id_36>",
43
+ "<extra_id_37>",
44
+ "<extra_id_38>",
45
+ "<extra_id_39>",
46
+ "<extra_id_40>",
47
+ "<extra_id_41>",
48
+ "<extra_id_42>",
49
+ "<extra_id_43>",
50
+ "<extra_id_44>",
51
+ "<extra_id_45>",
52
+ "<extra_id_46>",
53
+ "<extra_id_47>",
54
+ "<extra_id_48>",
55
+ "<extra_id_49>",
56
+ "<extra_id_50>",
57
+ "<extra_id_51>",
58
+ "<extra_id_52>",
59
+ "<extra_id_53>",
60
+ "<extra_id_54>",
61
+ "<extra_id_55>",
62
+ "<extra_id_56>",
63
+ "<extra_id_57>",
64
+ "<extra_id_58>",
65
+ "<extra_id_59>",
66
+ "<extra_id_60>",
67
+ "<extra_id_61>",
68
+ "<extra_id_62>",
69
+ "<extra_id_63>",
70
+ "<extra_id_64>",
71
+ "<extra_id_65>",
72
+ "<extra_id_66>",
73
+ "<extra_id_67>",
74
+ "<extra_id_68>",
75
+ "<extra_id_69>",
76
+ "<extra_id_70>",
77
+ "<extra_id_71>",
78
+ "<extra_id_72>",
79
+ "<extra_id_73>",
80
+ "<extra_id_74>",
81
+ "<extra_id_75>",
82
+ "<extra_id_76>",
83
+ "<extra_id_77>",
84
+ "<extra_id_78>",
85
+ "<extra_id_79>",
86
+ "<extra_id_80>",
87
+ "<extra_id_81>",
88
+ "<extra_id_82>",
89
+ "<extra_id_83>",
90
+ "<extra_id_84>",
91
+ "<extra_id_85>",
92
+ "<extra_id_86>",
93
+ "<extra_id_87>",
94
+ "<extra_id_88>",
95
+ "<extra_id_89>",
96
+ "<extra_id_90>",
97
+ "<extra_id_91>",
98
+ "<extra_id_92>",
99
+ "<extra_id_93>",
100
+ "<extra_id_94>",
101
+ "<extra_id_95>",
102
+ "<extra_id_96>",
103
+ "<extra_id_97>",
104
+ "<extra_id_98>",
105
+ "<extra_id_99>"
106
+ ],
107
+ "is_local": false,
108
+ "model_max_length": 1000000000000000019884624838656,
109
+ "pad_token": "<pad>",
110
+ "tokenizer_class": "T5Tokenizer",
111
+ "unk_token": "<unk>"
112
+ }
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1475,
3
+ "best_metric": 4.664707660675049,
4
+ "best_model_checkpoint": "/Users/alexgrigoras/Library/Mobile Documents/com~apple~CloudDocs/[5] Software/github/dif-pi/artifacts/models/sdg_chronos_t5_small_dunnhumby/checkpoint-750",
5
+ "epoch": 0.6211823170100425,
6
+ "eval_steps": 25,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.01035303861683404,
14
+ "grad_norm": 0.2731291949748993,
15
+ "learning_rate": 8.000000000000001e-06,
16
+ "loss": 45.1662451171875,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.01035303861683404,
21
+ "eval_loss": 5.382182598114014,
22
+ "eval_runtime": 60.5419,
23
+ "eval_samples_per_second": 70.91,
24
+ "eval_steps_per_second": 35.463,
25
+ "step": 25
26
+ },
27
+ {
28
+ "epoch": 0.02070607723366808,
29
+ "grad_norm": 0.2949911952018738,
30
+ "learning_rate": 1.6333333333333335e-05,
31
+ "loss": 44.3224951171875,
32
+ "step": 50
33
+ },
34
+ {
35
+ "epoch": 0.02070607723366808,
36
+ "eval_loss": 5.343369960784912,
37
+ "eval_runtime": 47.1772,
38
+ "eval_samples_per_second": 90.997,
39
+ "eval_steps_per_second": 45.509,
40
+ "step": 50
41
+ },
42
+ {
43
+ "epoch": 0.031059115850502122,
44
+ "grad_norm": 0.2861124277114868,
45
+ "learning_rate": 2.466666666666667e-05,
46
+ "loss": 43.54326171875,
47
+ "step": 75
48
+ },
49
+ {
50
+ "epoch": 0.031059115850502122,
51
+ "eval_loss": 5.254676818847656,
52
+ "eval_runtime": 48.6137,
53
+ "eval_samples_per_second": 88.308,
54
+ "eval_steps_per_second": 44.164,
55
+ "step": 75
56
+ },
57
+ {
58
+ "epoch": 0.04141215446733616,
59
+ "grad_norm": 0.3088361918926239,
60
+ "learning_rate": 2.498250672211728e-05,
61
+ "loss": 43.87701171875,
62
+ "step": 100
63
+ },
64
+ {
65
+ "epoch": 0.04141215446733616,
66
+ "eval_loss": 5.151735305786133,
67
+ "eval_runtime": 46.8392,
68
+ "eval_samples_per_second": 91.654,
69
+ "eval_steps_per_second": 45.838,
70
+ "step": 100
71
+ },
72
+ {
73
+ "epoch": 0.05176519308417021,
74
+ "grad_norm": 0.36366939544677734,
75
+ "learning_rate": 2.4927134858925575e-05,
76
+ "loss": 41.4726220703125,
77
+ "step": 125
78
+ },
79
+ {
80
+ "epoch": 0.05176519308417021,
81
+ "eval_loss": 5.072572231292725,
82
+ "eval_runtime": 43.9205,
83
+ "eval_samples_per_second": 97.745,
84
+ "eval_steps_per_second": 48.884,
85
+ "step": 125
86
+ },
87
+ {
88
+ "epoch": 0.062118231701004244,
89
+ "grad_norm": 0.44364720582962036,
90
+ "learning_rate": 2.4834022195605383e-05,
91
+ "loss": 41.1882958984375,
92
+ "step": 150
93
+ },
94
+ {
95
+ "epoch": 0.062118231701004244,
96
+ "eval_loss": 4.995686054229736,
97
+ "eval_runtime": 53.0192,
98
+ "eval_samples_per_second": 80.971,
99
+ "eval_steps_per_second": 40.495,
100
+ "step": 150
101
+ },
102
+ {
103
+ "epoch": 0.07247127031783829,
104
+ "grad_norm": 0.4700476825237274,
105
+ "learning_rate": 2.470345151225491e-05,
106
+ "loss": 41.691572265625,
107
+ "step": 175
108
+ },
109
+ {
110
+ "epoch": 0.07247127031783829,
111
+ "eval_loss": 4.918369293212891,
112
+ "eval_runtime": 47.97,
113
+ "eval_samples_per_second": 89.493,
114
+ "eval_steps_per_second": 44.757,
115
+ "step": 175
116
+ },
117
+ {
118
+ "epoch": 0.08282430893467232,
119
+ "grad_norm": 0.42324015498161316,
120
+ "learning_rate": 2.4535819347748074e-05,
121
+ "loss": 39.7604541015625,
122
+ "step": 200
123
+ },
124
+ {
125
+ "epoch": 0.08282430893467232,
126
+ "eval_loss": 4.853856086730957,
127
+ "eval_runtime": 45.9268,
128
+ "eval_samples_per_second": 93.475,
129
+ "eval_steps_per_second": 46.748,
130
+ "step": 200
131
+ },
132
+ {
133
+ "epoch": 0.09317734755150636,
134
+ "grad_norm": 0.4664323925971985,
135
+ "learning_rate": 2.433163479545898e-05,
136
+ "loss": 40.16502197265625,
137
+ "step": 225
138
+ },
139
+ {
140
+ "epoch": 0.09317734755150636,
141
+ "eval_loss": 4.832671642303467,
142
+ "eval_runtime": 49.0803,
143
+ "eval_samples_per_second": 87.469,
144
+ "eval_steps_per_second": 43.745,
145
+ "step": 225
146
+ },
147
+ {
148
+ "epoch": 0.10353038616834041,
149
+ "grad_norm": 0.475277304649353,
150
+ "learning_rate": 2.4091517957162068e-05,
151
+ "loss": 40.338056640625,
152
+ "step": 250
153
+ },
154
+ {
155
+ "epoch": 0.10353038616834041,
156
+ "eval_loss": 4.815681457519531,
157
+ "eval_runtime": 45.7189,
158
+ "eval_samples_per_second": 93.9,
159
+ "eval_steps_per_second": 46.961,
160
+ "step": 250
161
+ },
162
+ {
163
+ "epoch": 0.11388342478517445,
164
+ "grad_norm": 0.46192488074302673,
165
+ "learning_rate": 2.3816198059803415e-05,
166
+ "loss": 39.81144287109375,
167
+ "step": 275
168
+ },
169
+ {
170
+ "epoch": 0.11388342478517445,
171
+ "eval_loss": 4.8025665283203125,
172
+ "eval_runtime": 44.0374,
173
+ "eval_samples_per_second": 97.485,
174
+ "eval_steps_per_second": 48.754,
175
+ "step": 275
176
+ },
177
+ {
178
+ "epoch": 0.12423646340200849,
179
+ "grad_norm": 0.44529587030410767,
180
+ "learning_rate": 2.350651124086246e-05,
181
+ "loss": 40.06572509765625,
182
+ "step": 300
183
+ },
184
+ {
185
+ "epoch": 0.12423646340200849,
186
+ "eval_loss": 4.791704177856445,
187
+ "eval_runtime": 46.8164,
188
+ "eval_samples_per_second": 91.699,
189
+ "eval_steps_per_second": 45.86,
190
+ "step": 300
191
+ },
192
+ {
193
+ "epoch": 0.13458950201884254,
194
+ "grad_norm": 0.47136980295181274,
195
+ "learning_rate": 2.316339800902997e-05,
196
+ "loss": 39.34464599609375,
197
+ "step": 325
198
+ },
199
+ {
200
+ "epoch": 0.13458950201884254,
201
+ "eval_loss": 4.781772136688232,
202
+ "eval_runtime": 46.7709,
203
+ "eval_samples_per_second": 91.788,
204
+ "eval_steps_per_second": 45.905,
205
+ "step": 325
206
+ },
207
+ {
208
+ "epoch": 0.14494254063567658,
209
+ "grad_norm": 0.5043098330497742,
210
+ "learning_rate": 2.2787900387914035e-05,
211
+ "loss": 40.12859619140625,
212
+ "step": 350
213
+ },
214
+ {
215
+ "epoch": 0.14494254063567658,
216
+ "eval_loss": 4.77421236038208,
217
+ "eval_runtime": 46.6372,
218
+ "eval_samples_per_second": 92.051,
219
+ "eval_steps_per_second": 46.036,
220
+ "step": 350
221
+ },
222
+ {
223
+ "epoch": 0.1552955792525106,
224
+ "grad_norm": 0.43915286660194397,
225
+ "learning_rate": 2.238115875144865e-05,
226
+ "loss": 40.11051513671875,
227
+ "step": 375
228
+ },
229
+ {
230
+ "epoch": 0.1552955792525106,
231
+ "eval_loss": 4.768870830535889,
232
+ "eval_runtime": 46.1314,
233
+ "eval_samples_per_second": 93.06,
234
+ "eval_steps_per_second": 46.541,
235
+ "step": 375
236
+ },
237
+ {
238
+ "epoch": 0.16564861786934465,
239
+ "grad_norm": 0.4874376058578491,
240
+ "learning_rate": 2.1944408360615527e-05,
241
+ "loss": 40.54395751953125,
242
+ "step": 400
243
+ },
244
+ {
245
+ "epoch": 0.16564861786934465,
246
+ "eval_loss": 4.759785175323486,
247
+ "eval_runtime": 45.0656,
248
+ "eval_samples_per_second": 95.261,
249
+ "eval_steps_per_second": 47.642,
250
+ "step": 400
251
+ },
252
+ {
253
+ "epoch": 0.1760016564861787,
254
+ "grad_norm": 0.47157636284828186,
255
+ "learning_rate": 2.147897561199711e-05,
256
+ "loss": 38.57564208984375,
257
+ "step": 425
258
+ },
259
+ {
260
+ "epoch": 0.1760016564861787,
261
+ "eval_loss": 4.7538371086120605,
262
+ "eval_runtime": 45.1789,
263
+ "eval_samples_per_second": 95.022,
264
+ "eval_steps_per_second": 47.522,
265
+ "step": 425
266
+ },
267
+ {
268
+ "epoch": 0.18635469510301272,
269
+ "grad_norm": 0.5695982575416565,
270
+ "learning_rate": 2.0986274009553747e-05,
271
+ "loss": 40.2056494140625,
272
+ "step": 450
273
+ },
274
+ {
275
+ "epoch": 0.18635469510301272,
276
+ "eval_loss": 4.746274471282959,
277
+ "eval_runtime": 48.2349,
278
+ "eval_samples_per_second": 89.002,
279
+ "eval_steps_per_second": 44.511,
280
+ "step": 450
281
+ },
282
+ {
283
+ "epoch": 0.19670773371984676,
284
+ "grad_norm": 0.4859912097454071,
285
+ "learning_rate": 2.0467799871858624e-05,
286
+ "loss": 39.90147705078125,
287
+ "step": 475
288
+ },
289
+ {
290
+ "epoch": 0.19670773371984676,
291
+ "eval_loss": 4.741403579711914,
292
+ "eval_runtime": 47.2353,
293
+ "eval_samples_per_second": 90.885,
294
+ "eval_steps_per_second": 45.453,
295
+ "step": 475
296
+ },
297
+ {
298
+ "epoch": 0.20706077233668083,
299
+ "grad_norm": 0.5383442640304565,
300
+ "learning_rate": 1.9925127787827415e-05,
301
+ "loss": 39.66552001953125,
302
+ "step": 500
303
+ },
304
+ {
305
+ "epoch": 0.20706077233668083,
306
+ "eval_loss": 4.736755847930908,
307
+ "eval_runtime": 43.4301,
308
+ "eval_samples_per_second": 98.849,
309
+ "eval_steps_per_second": 49.436,
310
+ "step": 500
311
+ },
312
+ {
313
+ "epoch": 0.21741381095351486,
314
+ "grad_norm": 0.47965624928474426,
315
+ "learning_rate": 1.9359905834743513e-05,
316
+ "loss": 39.6004296875,
317
+ "step": 525
318
+ },
319
+ {
320
+ "epoch": 0.21741381095351486,
321
+ "eval_loss": 4.732944011688232,
322
+ "eval_runtime": 43.41,
323
+ "eval_samples_per_second": 98.894,
324
+ "eval_steps_per_second": 49.459,
325
+ "step": 525
326
+ },
327
+ {
328
+ "epoch": 0.2277668495703489,
329
+ "grad_norm": 0.5654281973838806,
330
+ "learning_rate": 1.8773850573101503e-05,
331
+ "loss": 39.6916064453125,
332
+ "step": 550
333
+ },
334
+ {
335
+ "epoch": 0.2277668495703489,
336
+ "eval_loss": 4.729019641876221,
337
+ "eval_runtime": 43.337,
338
+ "eval_samples_per_second": 99.061,
339
+ "eval_steps_per_second": 49.542,
340
+ "step": 550
341
+ },
342
+ {
343
+ "epoch": 0.23811988818718294,
344
+ "grad_norm": 0.562452495098114,
345
+ "learning_rate": 1.8168741833469327e-05,
346
+ "loss": 39.9837548828125,
347
+ "step": 575
348
+ },
349
+ {
350
+ "epoch": 0.23811988818718294,
351
+ "eval_loss": 4.7265305519104,
352
+ "eval_runtime": 43.4156,
353
+ "eval_samples_per_second": 98.882,
354
+ "eval_steps_per_second": 49.452,
355
+ "step": 575
356
+ },
357
+ {
358
+ "epoch": 0.24847292680401697,
359
+ "grad_norm": 0.6212955117225647,
360
+ "learning_rate": 1.7546417311201357e-05,
361
+ "loss": 39.41627685546875,
362
+ "step": 600
363
+ },
364
+ {
365
+ "epoch": 0.24847292680401697,
366
+ "eval_loss": 4.7227606773376465,
367
+ "eval_runtime": 43.4821,
368
+ "eval_samples_per_second": 98.73,
369
+ "eval_steps_per_second": 49.377,
370
+ "step": 600
371
+ },
372
+ {
373
+ "epoch": 0.258825965420851,
374
+ "grad_norm": 0.54314124584198,
375
+ "learning_rate": 1.690876698541802e-05,
376
+ "loss": 39.156318359375,
377
+ "step": 625
378
+ },
379
+ {
380
+ "epoch": 0.258825965420851,
381
+ "eval_loss": 4.717469215393066,
382
+ "eval_runtime": 43.0545,
383
+ "eval_samples_per_second": 99.711,
384
+ "eval_steps_per_second": 49.867,
385
+ "step": 625
386
+ },
387
+ {
388
+ "epoch": 0.2691790040376851,
389
+ "grad_norm": 0.6269752383232117,
390
+ "learning_rate": 1.625772737920128e-05,
391
+ "loss": 39.20115966796875,
392
+ "step": 650
393
+ },
394
+ {
395
+ "epoch": 0.2691790040376851,
396
+ "eval_loss": 4.716719150543213,
397
+ "eval_runtime": 120.2804,
398
+ "eval_samples_per_second": 35.692,
399
+ "eval_steps_per_second": 17.85,
400
+ "step": 650
401
+ },
402
+ {
403
+ "epoch": 0.2795320426545191,
404
+ "grad_norm": 0.5073297023773193,
405
+ "learning_rate": 1.5595275678437756e-05,
406
+ "loss": 39.50381591796875,
407
+ "step": 675
408
+ },
409
+ {
410
+ "epoch": 0.2795320426545191,
411
+ "eval_loss": 4.712583065032959,
412
+ "eval_runtime": 43.4686,
413
+ "eval_samples_per_second": 98.761,
414
+ "eval_steps_per_second": 49.392,
415
+ "step": 675
416
+ },
417
+ {
418
+ "epoch": 0.28988508127135315,
419
+ "grad_norm": 0.5422746539115906,
420
+ "learning_rate": 1.4923423727170106e-05,
421
+ "loss": 38.739453125,
422
+ "step": 700
423
+ },
424
+ {
425
+ "epoch": 0.28988508127135315,
426
+ "eval_loss": 4.711677074432373,
427
+ "eval_runtime": 45.4531,
428
+ "eval_samples_per_second": 94.449,
429
+ "eval_steps_per_second": 47.235,
430
+ "step": 700
431
+ },
432
+ {
433
+ "epoch": 0.30023811988818716,
434
+ "grad_norm": 0.5396411418914795,
435
+ "learning_rate": 1.4244211917692812e-05,
436
+ "loss": 38.6535791015625,
437
+ "step": 725
438
+ },
439
+ {
440
+ "epoch": 0.30023811988818716,
441
+ "eval_loss": 4.707785606384277,
442
+ "eval_runtime": 45.9015,
443
+ "eval_samples_per_second": 93.526,
444
+ "eval_steps_per_second": 46.774,
445
+ "step": 725
446
+ },
447
+ {
448
+ "epoch": 0.3105911585050212,
449
+ "grad_norm": 0.6173298358917236,
450
+ "learning_rate": 1.355970299394786e-05,
451
+ "loss": 38.515927734375,
452
+ "step": 750
453
+ },
454
+ {
455
+ "epoch": 0.3105911585050212,
456
+ "eval_loss": 4.705667495727539,
457
+ "eval_runtime": 46.4706,
458
+ "eval_samples_per_second": 92.381,
459
+ "eval_steps_per_second": 46.201,
460
+ "step": 750
461
+ },
462
+ {
463
+ "epoch": 0.3209441971218553,
464
+ "grad_norm": 0.7035080790519714,
465
+ "learning_rate": 1.2871975787039157e-05,
466
+ "loss": 38.55907470703125,
467
+ "step": 775
468
+ },
469
+ {
470
+ "epoch": 0.3209441971218553,
471
+ "eval_loss": 4.701103687286377,
472
+ "eval_runtime": 45.1696,
473
+ "eval_samples_per_second": 95.042,
474
+ "eval_steps_per_second": 47.532,
475
+ "step": 775
476
+ },
477
+ {
478
+ "epoch": 0.3312972357386893,
479
+ "grad_norm": 0.5796102285385132,
480
+ "learning_rate": 1.218311890189081e-05,
481
+ "loss": 39.2241943359375,
482
+ "step": 800
483
+ },
484
+ {
485
+ "epoch": 0.3312972357386893,
486
+ "eval_loss": 4.699450492858887,
487
+ "eval_runtime": 44.0268,
488
+ "eval_samples_per_second": 97.509,
489
+ "eval_steps_per_second": 48.766,
490
+ "step": 800
491
+ },
492
+ {
493
+ "epoch": 0.34165027435552336,
494
+ "grad_norm": 0.6741734147071838,
495
+ "learning_rate": 1.1495224374222602e-05,
496
+ "loss": 39.471005859375,
497
+ "step": 825
498
+ },
499
+ {
500
+ "epoch": 0.34165027435552336,
501
+ "eval_loss": 4.693467617034912,
502
+ "eval_runtime": 53.2324,
503
+ "eval_samples_per_second": 80.646,
504
+ "eval_steps_per_second": 40.333,
505
+ "step": 825
506
+ },
507
+ {
508
+ "epoch": 0.3520033129723574,
509
+ "grad_norm": 0.6025944352149963,
510
+ "learning_rate": 1.0810381317106293e-05,
511
+ "loss": 40.34841064453125,
512
+ "step": 850
513
+ },
514
+ {
515
+ "epoch": 0.3520033129723574,
516
+ "eval_loss": 4.69282865524292,
517
+ "eval_runtime": 46.3536,
518
+ "eval_samples_per_second": 92.614,
519
+ "eval_steps_per_second": 46.318,
520
+ "step": 850
521
+ },
522
+ {
523
+ "epoch": 0.36235635158919144,
524
+ "grad_norm": 0.6447405219078064,
525
+ "learning_rate": 1.013066957639785e-05,
526
+ "loss": 39.40240478515625,
527
+ "step": 875
528
+ },
529
+ {
530
+ "epoch": 0.36235635158919144,
531
+ "eval_loss": 4.688658237457275,
532
+ "eval_runtime": 48.7748,
533
+ "eval_samples_per_second": 88.017,
534
+ "eval_steps_per_second": 44.019,
535
+ "step": 875
536
+ },
537
+ {
538
+ "epoch": 0.37270939020602545,
539
+ "grad_norm": 0.6256803870201111,
540
+ "learning_rate": 9.45815341431398e-06,
541
+ "loss": 40.3128173828125,
542
+ "step": 900
543
+ },
544
+ {
545
+ "epoch": 0.37270939020602545,
546
+ "eval_loss": 4.6877264976501465,
547
+ "eval_runtime": 46.4191,
548
+ "eval_samples_per_second": 92.484,
549
+ "eval_steps_per_second": 46.253,
550
+ "step": 900
551
+ },
552
+ {
553
+ "epoch": 0.3830624288228595,
554
+ "grad_norm": 0.6555071473121643,
555
+ "learning_rate": 8.79487524033558e-06,
556
+ "loss": 38.77753173828125,
557
+ "step": 925
558
+ },
559
+ {
560
+ "epoch": 0.3830624288228595,
561
+ "eval_loss": 4.686131477355957,
562
+ "eval_runtime": 45.5114,
563
+ "eval_samples_per_second": 94.328,
564
+ "eval_steps_per_second": 47.175,
565
+ "step": 925
566
+ },
567
+ {
568
+ "epoch": 0.3934154674396935,
569
+ "grad_norm": 0.5732009410858154,
570
+ "learning_rate": 8.142849408477312e-06,
571
+ "loss": 39.21095458984375,
572
+ "step": 950
573
+ },
574
+ {
575
+ "epoch": 0.3934154674396935,
576
+ "eval_loss": 4.684043884277344,
577
+ "eval_runtime": 43.9317,
578
+ "eval_samples_per_second": 97.72,
579
+ "eval_steps_per_second": 48.871,
580
+ "step": 950
581
+ },
582
+ {
583
+ "epoch": 0.4037685060565276,
584
+ "grad_norm": 0.7316587567329407,
585
+ "learning_rate": 7.504056099760629e-06,
586
+ "loss": 40.310380859375,
587
+ "step": 975
588
+ },
589
+ {
590
+ "epoch": 0.4037685060565276,
591
+ "eval_loss": 4.681900978088379,
592
+ "eval_runtime": 45.7448,
593
+ "eval_samples_per_second": 93.847,
594
+ "eval_steps_per_second": 46.934,
595
+ "step": 975
596
+ },
597
+ {
598
+ "epoch": 0.41412154467336165,
599
+ "grad_norm": 0.7832669019699097,
600
+ "learning_rate": 6.880435308469124e-06,
601
+ "loss": 39.5330810546875,
602
+ "step": 1000
603
+ },
604
+ {
605
+ "epoch": 0.41412154467336165,
606
+ "eval_loss": 4.680942058563232,
607
+ "eval_runtime": 47.3741,
608
+ "eval_samples_per_second": 90.619,
609
+ "eval_steps_per_second": 45.32,
610
+ "step": 1000
611
+ },
612
+ {
613
+ "epoch": 0.42447458329019566,
614
+ "grad_norm": 0.7218672633171082,
615
+ "learning_rate": 6.273880950449624e-06,
616
+ "loss": 40.64116455078125,
617
+ "step": 1025
618
+ },
619
+ {
620
+ "epoch": 0.42447458329019566,
621
+ "eval_loss": 4.677113056182861,
622
+ "eval_runtime": 44.5273,
623
+ "eval_samples_per_second": 96.413,
624
+ "eval_steps_per_second": 48.218,
625
+ "step": 1025
626
+ },
627
+ {
628
+ "epoch": 0.4348276219070297,
629
+ "grad_norm": 0.6343373656272888,
630
+ "learning_rate": 5.6862351113520505e-06,
631
+ "loss": 39.680654296875,
632
+ "step": 1050
633
+ },
634
+ {
635
+ "epoch": 0.4348276219070297,
636
+ "eval_loss": 4.6747050285339355,
637
+ "eval_runtime": 46.9899,
638
+ "eval_samples_per_second": 91.36,
639
+ "eval_steps_per_second": 45.691,
640
+ "step": 1050
641
+ },
642
+ {
643
+ "epoch": 0.44518066052386374,
644
+ "grad_norm": 0.6286041140556335,
645
+ "learning_rate": 5.119282452275787e-06,
646
+ "loss": 38.24800048828125,
647
+ "step": 1075
648
+ },
649
+ {
650
+ "epoch": 0.44518066052386374,
651
+ "eval_loss": 4.674582481384277,
652
+ "eval_runtime": 45.6532,
653
+ "eval_samples_per_second": 94.035,
654
+ "eval_steps_per_second": 47.029,
655
+ "step": 1075
656
+ },
657
+ {
658
+ "epoch": 0.4555336991406978,
659
+ "grad_norm": 0.6874698400497437,
660
+ "learning_rate": 4.574744789812638e-06,
661
+ "loss": 39.356064453125,
662
+ "step": 1100
663
+ },
664
+ {
665
+ "epoch": 0.4555336991406978,
666
+ "eval_loss": 4.672372817993164,
667
+ "eval_runtime": 43.1981,
668
+ "eval_samples_per_second": 99.379,
669
+ "eval_steps_per_second": 49.701,
670
+ "step": 1100
671
+ },
672
+ {
673
+ "epoch": 0.4658867377575318,
674
+ "grad_norm": 0.6946823596954346,
675
+ "learning_rate": 4.054275866946371e-06,
676
+ "loss": 39.3052978515625,
677
+ "step": 1125
678
+ },
679
+ {
680
+ "epoch": 0.4658867377575318,
681
+ "eval_loss": 4.669936656951904,
682
+ "eval_runtime": 43.411,
683
+ "eval_samples_per_second": 98.892,
684
+ "eval_steps_per_second": 49.458,
685
+ "step": 1125
686
+ },
687
+ {
688
+ "epoch": 0.4762397763743659,
689
+ "grad_norm": 0.6024668216705322,
690
+ "learning_rate": 3.559456330689684e-06,
691
+ "loss": 39.346806640625,
692
+ "step": 1150
693
+ },
694
+ {
695
+ "epoch": 0.4762397763743659,
696
+ "eval_loss": 4.66926383972168,
697
+ "eval_runtime": 43.0791,
698
+ "eval_samples_per_second": 99.654,
699
+ "eval_steps_per_second": 49.838,
700
+ "step": 1150
701
+ },
702
+ {
703
+ "epoch": 0.48659281499119994,
704
+ "grad_norm": 0.793952465057373,
705
+ "learning_rate": 3.091788931711123e-06,
706
+ "loss": 40.23644287109375,
707
+ "step": 1175
708
+ },
709
+ {
710
+ "epoch": 0.48659281499119994,
711
+ "eval_loss": 4.668498992919922,
712
+ "eval_runtime": 43.1576,
713
+ "eval_samples_per_second": 99.473,
714
+ "eval_steps_per_second": 49.748,
715
+ "step": 1175
716
+ },
717
+ {
718
+ "epoch": 0.49694585360803395,
719
+ "grad_norm": 0.7388107776641846,
720
+ "learning_rate": 2.652693960530743e-06,
721
+ "loss": 39.25404541015625,
722
+ "step": 1200
723
+ },
724
+ {
725
+ "epoch": 0.49694585360803395,
726
+ "eval_loss": 4.667989730834961,
727
+ "eval_runtime": 42.963,
728
+ "eval_samples_per_second": 99.923,
729
+ "eval_steps_per_second": 49.973,
730
+ "step": 1200
731
+ },
732
+ {
733
+ "epoch": 0.507298892224868,
734
+ "grad_norm": 0.671241044998169,
735
+ "learning_rate": 2.243504934144444e-06,
736
+ "loss": 39.02633544921875,
737
+ "step": 1225
738
+ },
739
+ {
740
+ "epoch": 0.507298892224868,
741
+ "eval_loss": 4.666492938995361,
742
+ "eval_runtime": 43.067,
743
+ "eval_samples_per_second": 99.682,
744
+ "eval_steps_per_second": 49.853,
745
+ "step": 1225
746
+ },
747
+ {
748
+ "epoch": 0.517651930841702,
749
+ "grad_norm": 0.7311628460884094,
750
+ "learning_rate": 1.8654645461766941e-06,
751
+ "loss": 39.51279541015625,
752
+ "step": 1250
753
+ },
754
+ {
755
+ "epoch": 0.517651930841702,
756
+ "eval_loss": 4.665400981903076,
757
+ "eval_runtime": 221.1942,
758
+ "eval_samples_per_second": 19.408,
759
+ "eval_steps_per_second": 9.706,
760
+ "step": 1250
761
+ },
762
+ {
763
+ "epoch": 0.528004969458536,
764
+ "grad_norm": 0.6689982414245605,
765
+ "learning_rate": 1.5197208928608375e-06,
766
+ "loss": 38.7842919921875,
767
+ "step": 1275
768
+ },
769
+ {
770
+ "epoch": 0.528004969458536,
771
+ "eval_loss": 4.666904926300049,
772
+ "eval_runtime": 44.7733,
773
+ "eval_samples_per_second": 95.883,
774
+ "eval_steps_per_second": 47.953,
775
+ "step": 1275
776
+ },
777
+ {
778
+ "epoch": 0.5383580080753702,
779
+ "grad_norm": 0.5878821015357971,
780
+ "learning_rate": 1.2073239863085644e-06,
781
+ "loss": 39.191982421875,
782
+ "step": 1300
783
+ },
784
+ {
785
+ "epoch": 0.5383580080753702,
786
+ "eval_loss": 4.666422367095947,
787
+ "eval_runtime": 44.9701,
788
+ "eval_samples_per_second": 95.463,
789
+ "eval_steps_per_second": 47.743,
790
+ "step": 1300
791
+ },
792
+ {
793
+ "epoch": 0.5487110466922042,
794
+ "grad_norm": 0.9803292155265808,
795
+ "learning_rate": 9.292225656576406e-07,
796
+ "loss": 38.61263671875,
797
+ "step": 1325
798
+ },
799
+ {
800
+ "epoch": 0.5487110466922042,
801
+ "eval_loss": 4.665764331817627,
802
+ "eval_runtime": 45.6356,
803
+ "eval_samples_per_second": 94.071,
804
+ "eval_steps_per_second": 47.047,
805
+ "step": 1325
806
+ },
807
+ {
808
+ "epoch": 0.5590640853090382,
809
+ "grad_norm": 0.7317976951599121,
810
+ "learning_rate": 6.862612157823259e-07,
811
+ "loss": 39.42274169921875,
812
+ "step": 1350
813
+ },
814
+ {
815
+ "epoch": 0.5590640853090382,
816
+ "eval_loss": 4.665533542633057,
817
+ "eval_runtime": 44.7641,
818
+ "eval_samples_per_second": 95.903,
819
+ "eval_steps_per_second": 47.963,
820
+ "step": 1350
821
+ },
822
+ {
823
+ "epoch": 0.5694171239258723,
824
+ "grad_norm": 0.7013985514640808,
825
+ "learning_rate": 4.7917780231687e-07,
826
+ "loss": 39.69686767578125,
827
+ "step": 1375
828
+ },
829
+ {
830
+ "epoch": 0.5694171239258723,
831
+ "eval_loss": 4.665374279022217,
832
+ "eval_runtime": 43.6247,
833
+ "eval_samples_per_second": 98.408,
834
+ "eval_steps_per_second": 49.215,
835
+ "step": 1375
836
+ },
837
+ {
838
+ "epoch": 0.5797701625427063,
839
+ "grad_norm": 0.7620524764060974,
840
+ "learning_rate": 3.0860123078183255e-07,
841
+ "loss": 38.945185546875,
842
+ "step": 1400
843
+ },
844
+ {
845
+ "epoch": 0.5797701625427063,
846
+ "eval_loss": 4.66505241394043,
847
+ "eval_runtime": 43.4121,
848
+ "eval_samples_per_second": 98.89,
849
+ "eval_steps_per_second": 49.456,
850
+ "step": 1400
851
+ },
852
+ {
853
+ "epoch": 0.5901232011595403,
854
+ "grad_norm": 0.6809831857681274,
855
+ "learning_rate": 1.7504953661868912e-07,
856
+ "loss": 38.79550048828125,
857
+ "step": 1425
858
+ },
859
+ {
860
+ "epoch": 0.5901232011595403,
861
+ "eval_loss": 4.664721488952637,
862
+ "eval_runtime": 43.3122,
863
+ "eval_samples_per_second": 99.118,
864
+ "eval_steps_per_second": 49.57,
865
+ "step": 1425
866
+ },
867
+ {
868
+ "epoch": 0.6004762397763743,
869
+ "grad_norm": 0.7748914957046509,
870
+ "learning_rate": 7.89283119332157e-08,
871
+ "loss": 38.775634765625,
872
+ "step": 1450
873
+ },
874
+ {
875
+ "epoch": 0.6004762397763743,
876
+ "eval_loss": 4.664709091186523,
877
+ "eval_runtime": 43.4487,
878
+ "eval_samples_per_second": 98.806,
879
+ "eval_steps_per_second": 49.415,
880
+ "step": 1450
881
+ },
882
+ {
883
+ "epoch": 0.6108292783932084,
884
+ "grad_norm": 0.5979882478713989,
885
+ "learning_rate": 2.0529473725605652e-08,
886
+ "loss": 39.74494384765625,
887
+ "step": 1475
888
+ },
889
+ {
890
+ "epoch": 0.6108292783932084,
891
+ "eval_loss": 4.664707660675049,
892
+ "eval_runtime": 43.4853,
893
+ "eval_samples_per_second": 98.723,
894
+ "eval_steps_per_second": 49.373,
895
+ "step": 1475
896
+ },
897
+ {
898
+ "epoch": 0.6211823170100425,
899
+ "grad_norm": 0.7802590131759644,
900
+ "learning_rate": 3.0377348117505145e-11,
901
+ "loss": 38.9613037109375,
902
+ "step": 1500
903
+ },
904
+ {
905
+ "epoch": 0.6211823170100425,
906
+ "eval_loss": 4.664709091186523,
907
+ "eval_runtime": 122.0162,
908
+ "eval_samples_per_second": 35.184,
909
+ "eval_steps_per_second": 17.596,
910
+ "step": 1500
911
+ }
912
+ ],
913
+ "logging_steps": 25,
914
+ "max_steps": 1500,
915
+ "num_input_tokens_seen": 0,
916
+ "num_train_epochs": 1,
917
+ "save_steps": 750,
918
+ "stateful_callbacks": {
919
+ "TrainerControl": {
920
+ "args": {
921
+ "should_epoch_stop": false,
922
+ "should_evaluate": false,
923
+ "should_log": false,
924
+ "should_save": true,
925
+ "should_training_stop": true
926
+ },
927
+ "attributes": {}
928
+ }
929
+ },
930
+ "total_flos": 1149555179520000.0,
931
+ "train_batch_size": 2,
932
+ "trial_name": null,
933
+ "trial_params": null
934
+ }
checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2fc6a0a094cb5c9a6e42317044c60b442bf2605691d8bc9207b0c529a660502
3
+ size 5457
checkpoint-750/README.md ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: amazon/chronos-t5-small
3
+ library_name: peft
4
+ tags:
5
+ - base_model:adapter:amazon/chronos-t5-small
6
+ - lora
7
+ - transformers
8
+ ---
9
+
10
+ # Model Card for Model ID
11
+
12
+ <!-- Provide a quick summary of what the model is/does. -->
13
+
14
+
15
+
16
+ ## Model Details
17
+
18
+ ### Model Description
19
+
20
+ <!-- Provide a longer summary of what this model is. -->
21
+
22
+
23
+
24
+ - **Developed by:** [More Information Needed]
25
+ - **Funded by [optional]:** [More Information Needed]
26
+ - **Shared by [optional]:** [More Information Needed]
27
+ - **Model type:** [More Information Needed]
28
+ - **Language(s) (NLP):** [More Information Needed]
29
+ - **License:** [More Information Needed]
30
+ - **Finetuned from model [optional]:** [More Information Needed]
31
+
32
+ ### Model Sources [optional]
33
+
34
+ <!-- Provide the basic links for the model. -->
35
+
36
+ - **Repository:** [More Information Needed]
37
+ - **Paper [optional]:** [More Information Needed]
38
+ - **Demo [optional]:** [More Information Needed]
39
+
40
+ ## Uses
41
+
42
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
43
+
44
+ ### Direct Use
45
+
46
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
47
+
48
+ [More Information Needed]
49
+
50
+ ### Downstream Use [optional]
51
+
52
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
53
+
54
+ [More Information Needed]
55
+
56
+ ### Out-of-Scope Use
57
+
58
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
59
+
60
+ [More Information Needed]
61
+
62
+ ## Bias, Risks, and Limitations
63
+
64
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
65
+
66
+ [More Information Needed]
67
+
68
+ ### Recommendations
69
+
70
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
71
+
72
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
73
+
74
+ ## How to Get Started with the Model
75
+
76
+ Use the code below to get started with the model.
77
+
78
+ [More Information Needed]
79
+
80
+ ## Training Details
81
+
82
+ ### Training Data
83
+
84
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
85
+
86
+ [More Information Needed]
87
+
88
+ ### Training Procedure
89
+
90
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
91
+
92
+ #### Preprocessing [optional]
93
+
94
+ [More Information Needed]
95
+
96
+
97
+ #### Training Hyperparameters
98
+
99
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
100
+
101
+ #### Speeds, Sizes, Times [optional]
102
+
103
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
104
+
105
+ [More Information Needed]
106
+
107
+ ## Evaluation
108
+
109
+ <!-- This section describes the evaluation protocols and provides the results. -->
110
+
111
+ ### Testing Data, Factors & Metrics
112
+
113
+ #### Testing Data
114
+
115
+ <!-- This should link to a Dataset Card if possible. -->
116
+
117
+ [More Information Needed]
118
+
119
+ #### Factors
120
+
121
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
122
+
123
+ [More Information Needed]
124
+
125
+ #### Metrics
126
+
127
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
128
+
129
+ [More Information Needed]
130
+
131
+ ### Results
132
+
133
+ [More Information Needed]
134
+
135
+ #### Summary
136
+
137
+
138
+
139
+ ## Model Examination [optional]
140
+
141
+ <!-- Relevant interpretability work for the model goes here -->
142
+
143
+ [More Information Needed]
144
+
145
+ ## Environmental Impact
146
+
147
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
148
+
149
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
150
+
151
+ - **Hardware Type:** [More Information Needed]
152
+ - **Hours used:** [More Information Needed]
153
+ - **Cloud Provider:** [More Information Needed]
154
+ - **Compute Region:** [More Information Needed]
155
+ - **Carbon Emitted:** [More Information Needed]
156
+
157
+ ## Technical Specifications [optional]
158
+
159
+ ### Model Architecture and Objective
160
+
161
+ [More Information Needed]
162
+
163
+ ### Compute Infrastructure
164
+
165
+ [More Information Needed]
166
+
167
+ #### Hardware
168
+
169
+ [More Information Needed]
170
+
171
+ #### Software
172
+
173
+ [More Information Needed]
174
+
175
+ ## Citation [optional]
176
+
177
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
178
+
179
+ **BibTeX:**
180
+
181
+ [More Information Needed]
182
+
183
+ **APA:**
184
+
185
+ [More Information Needed]
186
+
187
+ ## Glossary [optional]
188
+
189
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
190
+
191
+ [More Information Needed]
192
+
193
+ ## More Information [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Authors [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Contact
202
+
203
+ [More Information Needed]
204
+ ### Framework versions
205
+
206
+ - PEFT 0.18.1
checkpoint-750/adapter_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "amazon/chronos-t5-small",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "wo",
33
+ "q",
34
+ "v",
35
+ "o",
36
+ "k",
37
+ "wi"
38
+ ],
39
+ "target_parameters": null,
40
+ "task_type": "SEQ_2_SEQ_LM",
41
+ "trainable_token_indices": null,
42
+ "use_dora": false,
43
+ "use_qalora": false,
44
+ "use_rslora": false
45
+ }
checkpoint-750/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74ad9c4192071e1828c2569d41578cc93e5e765303d67fb476d048e7ca38e371
3
+ size 34675328
checkpoint-750/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f29b23d0907be3999bd9ae2003d9b82b00924c115f4cb8a413b5ad32f1768ca6
3
+ size 34759371
checkpoint-750/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:203d5fc4135888a6fb002fa823477a8f193b36896c0ab2c41be8b3adff903219
3
+ size 14391
checkpoint-750/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa54605a2dc63545c98573afb451d559983f127cc87c6edf86857f550d393a53
3
+ size 1465
checkpoint-750/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-750/tokenizer_config.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "eos_token": "</s>",
4
+ "extra_ids": 100,
5
+ "extra_special_tokens": [
6
+ "<extra_id_0>",
7
+ "<extra_id_1>",
8
+ "<extra_id_2>",
9
+ "<extra_id_3>",
10
+ "<extra_id_4>",
11
+ "<extra_id_5>",
12
+ "<extra_id_6>",
13
+ "<extra_id_7>",
14
+ "<extra_id_8>",
15
+ "<extra_id_9>",
16
+ "<extra_id_10>",
17
+ "<extra_id_11>",
18
+ "<extra_id_12>",
19
+ "<extra_id_13>",
20
+ "<extra_id_14>",
21
+ "<extra_id_15>",
22
+ "<extra_id_16>",
23
+ "<extra_id_17>",
24
+ "<extra_id_18>",
25
+ "<extra_id_19>",
26
+ "<extra_id_20>",
27
+ "<extra_id_21>",
28
+ "<extra_id_22>",
29
+ "<extra_id_23>",
30
+ "<extra_id_24>",
31
+ "<extra_id_25>",
32
+ "<extra_id_26>",
33
+ "<extra_id_27>",
34
+ "<extra_id_28>",
35
+ "<extra_id_29>",
36
+ "<extra_id_30>",
37
+ "<extra_id_31>",
38
+ "<extra_id_32>",
39
+ "<extra_id_33>",
40
+ "<extra_id_34>",
41
+ "<extra_id_35>",
42
+ "<extra_id_36>",
43
+ "<extra_id_37>",
44
+ "<extra_id_38>",
45
+ "<extra_id_39>",
46
+ "<extra_id_40>",
47
+ "<extra_id_41>",
48
+ "<extra_id_42>",
49
+ "<extra_id_43>",
50
+ "<extra_id_44>",
51
+ "<extra_id_45>",
52
+ "<extra_id_46>",
53
+ "<extra_id_47>",
54
+ "<extra_id_48>",
55
+ "<extra_id_49>",
56
+ "<extra_id_50>",
57
+ "<extra_id_51>",
58
+ "<extra_id_52>",
59
+ "<extra_id_53>",
60
+ "<extra_id_54>",
61
+ "<extra_id_55>",
62
+ "<extra_id_56>",
63
+ "<extra_id_57>",
64
+ "<extra_id_58>",
65
+ "<extra_id_59>",
66
+ "<extra_id_60>",
67
+ "<extra_id_61>",
68
+ "<extra_id_62>",
69
+ "<extra_id_63>",
70
+ "<extra_id_64>",
71
+ "<extra_id_65>",
72
+ "<extra_id_66>",
73
+ "<extra_id_67>",
74
+ "<extra_id_68>",
75
+ "<extra_id_69>",
76
+ "<extra_id_70>",
77
+ "<extra_id_71>",
78
+ "<extra_id_72>",
79
+ "<extra_id_73>",
80
+ "<extra_id_74>",
81
+ "<extra_id_75>",
82
+ "<extra_id_76>",
83
+ "<extra_id_77>",
84
+ "<extra_id_78>",
85
+ "<extra_id_79>",
86
+ "<extra_id_80>",
87
+ "<extra_id_81>",
88
+ "<extra_id_82>",
89
+ "<extra_id_83>",
90
+ "<extra_id_84>",
91
+ "<extra_id_85>",
92
+ "<extra_id_86>",
93
+ "<extra_id_87>",
94
+ "<extra_id_88>",
95
+ "<extra_id_89>",
96
+ "<extra_id_90>",
97
+ "<extra_id_91>",
98
+ "<extra_id_92>",
99
+ "<extra_id_93>",
100
+ "<extra_id_94>",
101
+ "<extra_id_95>",
102
+ "<extra_id_96>",
103
+ "<extra_id_97>",
104
+ "<extra_id_98>",
105
+ "<extra_id_99>"
106
+ ],
107
+ "is_local": false,
108
+ "model_max_length": 1000000000000000019884624838656,
109
+ "pad_token": "<pad>",
110
+ "tokenizer_class": "T5Tokenizer",
111
+ "unk_token": "<unk>"
112
+ }
checkpoint-750/trainer_state.json ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 750,
3
+ "best_metric": 4.705667495727539,
4
+ "best_model_checkpoint": "/Users/alexgrigoras/Library/Mobile Documents/com~apple~CloudDocs/[5] Software/github/dif-pi/artifacts/models/sdg_chronos_t5_small_dunnhumby/checkpoint-750",
5
+ "epoch": 0.3105911585050212,
6
+ "eval_steps": 25,
7
+ "global_step": 750,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.01035303861683404,
14
+ "grad_norm": 0.2731291949748993,
15
+ "learning_rate": 8.000000000000001e-06,
16
+ "loss": 45.1662451171875,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.01035303861683404,
21
+ "eval_loss": 5.382182598114014,
22
+ "eval_runtime": 60.5419,
23
+ "eval_samples_per_second": 70.91,
24
+ "eval_steps_per_second": 35.463,
25
+ "step": 25
26
+ },
27
+ {
28
+ "epoch": 0.02070607723366808,
29
+ "grad_norm": 0.2949911952018738,
30
+ "learning_rate": 1.6333333333333335e-05,
31
+ "loss": 44.3224951171875,
32
+ "step": 50
33
+ },
34
+ {
35
+ "epoch": 0.02070607723366808,
36
+ "eval_loss": 5.343369960784912,
37
+ "eval_runtime": 47.1772,
38
+ "eval_samples_per_second": 90.997,
39
+ "eval_steps_per_second": 45.509,
40
+ "step": 50
41
+ },
42
+ {
43
+ "epoch": 0.031059115850502122,
44
+ "grad_norm": 0.2861124277114868,
45
+ "learning_rate": 2.466666666666667e-05,
46
+ "loss": 43.54326171875,
47
+ "step": 75
48
+ },
49
+ {
50
+ "epoch": 0.031059115850502122,
51
+ "eval_loss": 5.254676818847656,
52
+ "eval_runtime": 48.6137,
53
+ "eval_samples_per_second": 88.308,
54
+ "eval_steps_per_second": 44.164,
55
+ "step": 75
56
+ },
57
+ {
58
+ "epoch": 0.04141215446733616,
59
+ "grad_norm": 0.3088361918926239,
60
+ "learning_rate": 2.498250672211728e-05,
61
+ "loss": 43.87701171875,
62
+ "step": 100
63
+ },
64
+ {
65
+ "epoch": 0.04141215446733616,
66
+ "eval_loss": 5.151735305786133,
67
+ "eval_runtime": 46.8392,
68
+ "eval_samples_per_second": 91.654,
69
+ "eval_steps_per_second": 45.838,
70
+ "step": 100
71
+ },
72
+ {
73
+ "epoch": 0.05176519308417021,
74
+ "grad_norm": 0.36366939544677734,
75
+ "learning_rate": 2.4927134858925575e-05,
76
+ "loss": 41.4726220703125,
77
+ "step": 125
78
+ },
79
+ {
80
+ "epoch": 0.05176519308417021,
81
+ "eval_loss": 5.072572231292725,
82
+ "eval_runtime": 43.9205,
83
+ "eval_samples_per_second": 97.745,
84
+ "eval_steps_per_second": 48.884,
85
+ "step": 125
86
+ },
87
+ {
88
+ "epoch": 0.062118231701004244,
89
+ "grad_norm": 0.44364720582962036,
90
+ "learning_rate": 2.4834022195605383e-05,
91
+ "loss": 41.1882958984375,
92
+ "step": 150
93
+ },
94
+ {
95
+ "epoch": 0.062118231701004244,
96
+ "eval_loss": 4.995686054229736,
97
+ "eval_runtime": 53.0192,
98
+ "eval_samples_per_second": 80.971,
99
+ "eval_steps_per_second": 40.495,
100
+ "step": 150
101
+ },
102
+ {
103
+ "epoch": 0.07247127031783829,
104
+ "grad_norm": 0.4700476825237274,
105
+ "learning_rate": 2.470345151225491e-05,
106
+ "loss": 41.691572265625,
107
+ "step": 175
108
+ },
109
+ {
110
+ "epoch": 0.07247127031783829,
111
+ "eval_loss": 4.918369293212891,
112
+ "eval_runtime": 47.97,
113
+ "eval_samples_per_second": 89.493,
114
+ "eval_steps_per_second": 44.757,
115
+ "step": 175
116
+ },
117
+ {
118
+ "epoch": 0.08282430893467232,
119
+ "grad_norm": 0.42324015498161316,
120
+ "learning_rate": 2.4535819347748074e-05,
121
+ "loss": 39.7604541015625,
122
+ "step": 200
123
+ },
124
+ {
125
+ "epoch": 0.08282430893467232,
126
+ "eval_loss": 4.853856086730957,
127
+ "eval_runtime": 45.9268,
128
+ "eval_samples_per_second": 93.475,
129
+ "eval_steps_per_second": 46.748,
130
+ "step": 200
131
+ },
132
+ {
133
+ "epoch": 0.09317734755150636,
134
+ "grad_norm": 0.4664323925971985,
135
+ "learning_rate": 2.433163479545898e-05,
136
+ "loss": 40.16502197265625,
137
+ "step": 225
138
+ },
139
+ {
140
+ "epoch": 0.09317734755150636,
141
+ "eval_loss": 4.832671642303467,
142
+ "eval_runtime": 49.0803,
143
+ "eval_samples_per_second": 87.469,
144
+ "eval_steps_per_second": 43.745,
145
+ "step": 225
146
+ },
147
+ {
148
+ "epoch": 0.10353038616834041,
149
+ "grad_norm": 0.475277304649353,
150
+ "learning_rate": 2.4091517957162068e-05,
151
+ "loss": 40.338056640625,
152
+ "step": 250
153
+ },
154
+ {
155
+ "epoch": 0.10353038616834041,
156
+ "eval_loss": 4.815681457519531,
157
+ "eval_runtime": 45.7189,
158
+ "eval_samples_per_second": 93.9,
159
+ "eval_steps_per_second": 46.961,
160
+ "step": 250
161
+ },
162
+ {
163
+ "epoch": 0.11388342478517445,
164
+ "grad_norm": 0.46192488074302673,
165
+ "learning_rate": 2.3816198059803415e-05,
166
+ "loss": 39.81144287109375,
167
+ "step": 275
168
+ },
169
+ {
170
+ "epoch": 0.11388342478517445,
171
+ "eval_loss": 4.8025665283203125,
172
+ "eval_runtime": 44.0374,
173
+ "eval_samples_per_second": 97.485,
174
+ "eval_steps_per_second": 48.754,
175
+ "step": 275
176
+ },
177
+ {
178
+ "epoch": 0.12423646340200849,
179
+ "grad_norm": 0.44529587030410767,
180
+ "learning_rate": 2.350651124086246e-05,
181
+ "loss": 40.06572509765625,
182
+ "step": 300
183
+ },
184
+ {
185
+ "epoch": 0.12423646340200849,
186
+ "eval_loss": 4.791704177856445,
187
+ "eval_runtime": 46.8164,
188
+ "eval_samples_per_second": 91.699,
189
+ "eval_steps_per_second": 45.86,
190
+ "step": 300
191
+ },
192
+ {
193
+ "epoch": 0.13458950201884254,
194
+ "grad_norm": 0.47136980295181274,
195
+ "learning_rate": 2.316339800902997e-05,
196
+ "loss": 39.34464599609375,
197
+ "step": 325
198
+ },
199
+ {
200
+ "epoch": 0.13458950201884254,
201
+ "eval_loss": 4.781772136688232,
202
+ "eval_runtime": 46.7709,
203
+ "eval_samples_per_second": 91.788,
204
+ "eval_steps_per_second": 45.905,
205
+ "step": 325
206
+ },
207
+ {
208
+ "epoch": 0.14494254063567658,
209
+ "grad_norm": 0.5043098330497742,
210
+ "learning_rate": 2.2787900387914035e-05,
211
+ "loss": 40.12859619140625,
212
+ "step": 350
213
+ },
214
+ {
215
+ "epoch": 0.14494254063567658,
216
+ "eval_loss": 4.77421236038208,
217
+ "eval_runtime": 46.6372,
218
+ "eval_samples_per_second": 92.051,
219
+ "eval_steps_per_second": 46.036,
220
+ "step": 350
221
+ },
222
+ {
223
+ "epoch": 0.1552955792525106,
224
+ "grad_norm": 0.43915286660194397,
225
+ "learning_rate": 2.238115875144865e-05,
226
+ "loss": 40.11051513671875,
227
+ "step": 375
228
+ },
229
+ {
230
+ "epoch": 0.1552955792525106,
231
+ "eval_loss": 4.768870830535889,
232
+ "eval_runtime": 46.1314,
233
+ "eval_samples_per_second": 93.06,
234
+ "eval_steps_per_second": 46.541,
235
+ "step": 375
236
+ },
237
+ {
238
+ "epoch": 0.16564861786934465,
239
+ "grad_norm": 0.4874376058578491,
240
+ "learning_rate": 2.1944408360615527e-05,
241
+ "loss": 40.54395751953125,
242
+ "step": 400
243
+ },
244
+ {
245
+ "epoch": 0.16564861786934465,
246
+ "eval_loss": 4.759785175323486,
247
+ "eval_runtime": 45.0656,
248
+ "eval_samples_per_second": 95.261,
249
+ "eval_steps_per_second": 47.642,
250
+ "step": 400
251
+ },
252
+ {
253
+ "epoch": 0.1760016564861787,
254
+ "grad_norm": 0.47157636284828186,
255
+ "learning_rate": 2.147897561199711e-05,
256
+ "loss": 38.57564208984375,
257
+ "step": 425
258
+ },
259
+ {
260
+ "epoch": 0.1760016564861787,
261
+ "eval_loss": 4.7538371086120605,
262
+ "eval_runtime": 45.1789,
263
+ "eval_samples_per_second": 95.022,
264
+ "eval_steps_per_second": 47.522,
265
+ "step": 425
266
+ },
267
+ {
268
+ "epoch": 0.18635469510301272,
269
+ "grad_norm": 0.5695982575416565,
270
+ "learning_rate": 2.0986274009553747e-05,
271
+ "loss": 40.2056494140625,
272
+ "step": 450
273
+ },
274
+ {
275
+ "epoch": 0.18635469510301272,
276
+ "eval_loss": 4.746274471282959,
277
+ "eval_runtime": 48.2349,
278
+ "eval_samples_per_second": 89.002,
279
+ "eval_steps_per_second": 44.511,
280
+ "step": 450
281
+ },
282
+ {
283
+ "epoch": 0.19670773371984676,
284
+ "grad_norm": 0.4859912097454071,
285
+ "learning_rate": 2.0467799871858624e-05,
286
+ "loss": 39.90147705078125,
287
+ "step": 475
288
+ },
289
+ {
290
+ "epoch": 0.19670773371984676,
291
+ "eval_loss": 4.741403579711914,
292
+ "eval_runtime": 47.2353,
293
+ "eval_samples_per_second": 90.885,
294
+ "eval_steps_per_second": 45.453,
295
+ "step": 475
296
+ },
297
+ {
298
+ "epoch": 0.20706077233668083,
299
+ "grad_norm": 0.5383442640304565,
300
+ "learning_rate": 1.9925127787827415e-05,
301
+ "loss": 39.66552001953125,
302
+ "step": 500
303
+ },
304
+ {
305
+ "epoch": 0.20706077233668083,
306
+ "eval_loss": 4.736755847930908,
307
+ "eval_runtime": 43.4301,
308
+ "eval_samples_per_second": 98.849,
309
+ "eval_steps_per_second": 49.436,
310
+ "step": 500
311
+ },
312
+ {
313
+ "epoch": 0.21741381095351486,
314
+ "grad_norm": 0.47965624928474426,
315
+ "learning_rate": 1.9359905834743513e-05,
316
+ "loss": 39.6004296875,
317
+ "step": 525
318
+ },
319
+ {
320
+ "epoch": 0.21741381095351486,
321
+ "eval_loss": 4.732944011688232,
322
+ "eval_runtime": 43.41,
323
+ "eval_samples_per_second": 98.894,
324
+ "eval_steps_per_second": 49.459,
325
+ "step": 525
326
+ },
327
+ {
328
+ "epoch": 0.2277668495703489,
329
+ "grad_norm": 0.5654281973838806,
330
+ "learning_rate": 1.8773850573101503e-05,
331
+ "loss": 39.6916064453125,
332
+ "step": 550
333
+ },
334
+ {
335
+ "epoch": 0.2277668495703489,
336
+ "eval_loss": 4.729019641876221,
337
+ "eval_runtime": 43.337,
338
+ "eval_samples_per_second": 99.061,
339
+ "eval_steps_per_second": 49.542,
340
+ "step": 550
341
+ },
342
+ {
343
+ "epoch": 0.23811988818718294,
344
+ "grad_norm": 0.562452495098114,
345
+ "learning_rate": 1.8168741833469327e-05,
346
+ "loss": 39.9837548828125,
347
+ "step": 575
348
+ },
349
+ {
350
+ "epoch": 0.23811988818718294,
351
+ "eval_loss": 4.7265305519104,
352
+ "eval_runtime": 43.4156,
353
+ "eval_samples_per_second": 98.882,
354
+ "eval_steps_per_second": 49.452,
355
+ "step": 575
356
+ },
357
+ {
358
+ "epoch": 0.24847292680401697,
359
+ "grad_norm": 0.6212955117225647,
360
+ "learning_rate": 1.7546417311201357e-05,
361
+ "loss": 39.41627685546875,
362
+ "step": 600
363
+ },
364
+ {
365
+ "epoch": 0.24847292680401697,
366
+ "eval_loss": 4.7227606773376465,
367
+ "eval_runtime": 43.4821,
368
+ "eval_samples_per_second": 98.73,
369
+ "eval_steps_per_second": 49.377,
370
+ "step": 600
371
+ },
372
+ {
373
+ "epoch": 0.258825965420851,
374
+ "grad_norm": 0.54314124584198,
375
+ "learning_rate": 1.690876698541802e-05,
376
+ "loss": 39.156318359375,
377
+ "step": 625
378
+ },
379
+ {
380
+ "epoch": 0.258825965420851,
381
+ "eval_loss": 4.717469215393066,
382
+ "eval_runtime": 43.0545,
383
+ "eval_samples_per_second": 99.711,
384
+ "eval_steps_per_second": 49.867,
385
+ "step": 625
386
+ },
387
+ {
388
+ "epoch": 0.2691790040376851,
389
+ "grad_norm": 0.6269752383232117,
390
+ "learning_rate": 1.625772737920128e-05,
391
+ "loss": 39.20115966796875,
392
+ "step": 650
393
+ },
394
+ {
395
+ "epoch": 0.2691790040376851,
396
+ "eval_loss": 4.716719150543213,
397
+ "eval_runtime": 120.2804,
398
+ "eval_samples_per_second": 35.692,
399
+ "eval_steps_per_second": 17.85,
400
+ "step": 650
401
+ },
402
+ {
403
+ "epoch": 0.2795320426545191,
404
+ "grad_norm": 0.5073297023773193,
405
+ "learning_rate": 1.5595275678437756e-05,
406
+ "loss": 39.50381591796875,
407
+ "step": 675
408
+ },
409
+ {
410
+ "epoch": 0.2795320426545191,
411
+ "eval_loss": 4.712583065032959,
412
+ "eval_runtime": 43.4686,
413
+ "eval_samples_per_second": 98.761,
414
+ "eval_steps_per_second": 49.392,
415
+ "step": 675
416
+ },
417
+ {
418
+ "epoch": 0.28988508127135315,
419
+ "grad_norm": 0.5422746539115906,
420
+ "learning_rate": 1.4923423727170106e-05,
421
+ "loss": 38.739453125,
422
+ "step": 700
423
+ },
424
+ {
425
+ "epoch": 0.28988508127135315,
426
+ "eval_loss": 4.711677074432373,
427
+ "eval_runtime": 45.4531,
428
+ "eval_samples_per_second": 94.449,
429
+ "eval_steps_per_second": 47.235,
430
+ "step": 700
431
+ },
432
+ {
433
+ "epoch": 0.30023811988818716,
434
+ "grad_norm": 0.5396411418914795,
435
+ "learning_rate": 1.4244211917692812e-05,
436
+ "loss": 38.6535791015625,
437
+ "step": 725
438
+ },
439
+ {
440
+ "epoch": 0.30023811988818716,
441
+ "eval_loss": 4.707785606384277,
442
+ "eval_runtime": 45.9015,
443
+ "eval_samples_per_second": 93.526,
444
+ "eval_steps_per_second": 46.774,
445
+ "step": 725
446
+ },
447
+ {
448
+ "epoch": 0.3105911585050212,
449
+ "grad_norm": 0.6173298358917236,
450
+ "learning_rate": 1.355970299394786e-05,
451
+ "loss": 38.515927734375,
452
+ "step": 750
453
+ },
454
+ {
455
+ "epoch": 0.3105911585050212,
456
+ "eval_loss": 4.705667495727539,
457
+ "eval_runtime": 46.4706,
458
+ "eval_samples_per_second": 92.381,
459
+ "eval_steps_per_second": 46.201,
460
+ "step": 750
461
+ }
462
+ ],
463
+ "logging_steps": 25,
464
+ "max_steps": 1500,
465
+ "num_input_tokens_seen": 0,
466
+ "num_train_epochs": 1,
467
+ "save_steps": 750,
468
+ "stateful_callbacks": {
469
+ "TrainerControl": {
470
+ "args": {
471
+ "should_epoch_stop": false,
472
+ "should_evaluate": false,
473
+ "should_log": false,
474
+ "should_save": true,
475
+ "should_training_stop": false
476
+ },
477
+ "attributes": {}
478
+ }
479
+ },
480
+ "total_flos": 574777589760000.0,
481
+ "train_batch_size": 2,
482
+ "trial_name": null,
483
+ "trial_params": null
484
+ }
checkpoint-750/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2fc6a0a094cb5c9a6e42317044c60b442bf2605691d8bc9207b0c529a660502
3
+ size 5457
sdg_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "amazon/chronos-t5-small",
3
+ "base_model_id": "amazon/chronos-t5-small",
4
+ "context_length": 140,
5
+ "prediction_length": 30,
6
+ "num_bins": 4094,
7
+ "value_range": [
8
+ -5.0,
9
+ 5.0
10
+ ],
11
+ "learning_rate": 2.5e-05,
12
+ "train_steps": 1500,
13
+ "lora_rank": 32,
14
+ "lora_alpha": 64,
15
+ "batch_size": 2,
16
+ "gradient_accumulation_steps": 8,
17
+ "max_source_length": 768,
18
+ "max_target_length": 256,
19
+ "random_state": 42,
20
+ "task_prefix": "generate synthetic retail demand future from historical context",
21
+ "seasonality_strength": 0.75,
22
+ "seasonal_period": 7,
23
+ "seasonal_fallback_strength": 0.35,
24
+ "zero_threshold_for_sparsity": 0.6,
25
+ "prefer_backend": "qlora",
26
+ "use_special_tokens": true,
27
+ "add_calendar_features": true,
28
+ "warmup_ratio": 0.05,
29
+ "weight_decay": 0.01
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "eos_token": "</s>",
4
+ "extra_ids": 100,
5
+ "extra_special_tokens": [
6
+ "<extra_id_0>",
7
+ "<extra_id_1>",
8
+ "<extra_id_2>",
9
+ "<extra_id_3>",
10
+ "<extra_id_4>",
11
+ "<extra_id_5>",
12
+ "<extra_id_6>",
13
+ "<extra_id_7>",
14
+ "<extra_id_8>",
15
+ "<extra_id_9>",
16
+ "<extra_id_10>",
17
+ "<extra_id_11>",
18
+ "<extra_id_12>",
19
+ "<extra_id_13>",
20
+ "<extra_id_14>",
21
+ "<extra_id_15>",
22
+ "<extra_id_16>",
23
+ "<extra_id_17>",
24
+ "<extra_id_18>",
25
+ "<extra_id_19>",
26
+ "<extra_id_20>",
27
+ "<extra_id_21>",
28
+ "<extra_id_22>",
29
+ "<extra_id_23>",
30
+ "<extra_id_24>",
31
+ "<extra_id_25>",
32
+ "<extra_id_26>",
33
+ "<extra_id_27>",
34
+ "<extra_id_28>",
35
+ "<extra_id_29>",
36
+ "<extra_id_30>",
37
+ "<extra_id_31>",
38
+ "<extra_id_32>",
39
+ "<extra_id_33>",
40
+ "<extra_id_34>",
41
+ "<extra_id_35>",
42
+ "<extra_id_36>",
43
+ "<extra_id_37>",
44
+ "<extra_id_38>",
45
+ "<extra_id_39>",
46
+ "<extra_id_40>",
47
+ "<extra_id_41>",
48
+ "<extra_id_42>",
49
+ "<extra_id_43>",
50
+ "<extra_id_44>",
51
+ "<extra_id_45>",
52
+ "<extra_id_46>",
53
+ "<extra_id_47>",
54
+ "<extra_id_48>",
55
+ "<extra_id_49>",
56
+ "<extra_id_50>",
57
+ "<extra_id_51>",
58
+ "<extra_id_52>",
59
+ "<extra_id_53>",
60
+ "<extra_id_54>",
61
+ "<extra_id_55>",
62
+ "<extra_id_56>",
63
+ "<extra_id_57>",
64
+ "<extra_id_58>",
65
+ "<extra_id_59>",
66
+ "<extra_id_60>",
67
+ "<extra_id_61>",
68
+ "<extra_id_62>",
69
+ "<extra_id_63>",
70
+ "<extra_id_64>",
71
+ "<extra_id_65>",
72
+ "<extra_id_66>",
73
+ "<extra_id_67>",
74
+ "<extra_id_68>",
75
+ "<extra_id_69>",
76
+ "<extra_id_70>",
77
+ "<extra_id_71>",
78
+ "<extra_id_72>",
79
+ "<extra_id_73>",
80
+ "<extra_id_74>",
81
+ "<extra_id_75>",
82
+ "<extra_id_76>",
83
+ "<extra_id_77>",
84
+ "<extra_id_78>",
85
+ "<extra_id_79>",
86
+ "<extra_id_80>",
87
+ "<extra_id_81>",
88
+ "<extra_id_82>",
89
+ "<extra_id_83>",
90
+ "<extra_id_84>",
91
+ "<extra_id_85>",
92
+ "<extra_id_86>",
93
+ "<extra_id_87>",
94
+ "<extra_id_88>",
95
+ "<extra_id_89>",
96
+ "<extra_id_90>",
97
+ "<extra_id_91>",
98
+ "<extra_id_92>",
99
+ "<extra_id_93>",
100
+ "<extra_id_94>",
101
+ "<extra_id_95>",
102
+ "<extra_id_96>",
103
+ "<extra_id_97>",
104
+ "<extra_id_98>",
105
+ "<extra_id_99>"
106
+ ],
107
+ "is_local": false,
108
+ "model_max_length": 1000000000000000019884624838656,
109
+ "pad_token": "<pad>",
110
+ "tokenizer_class": "T5Tokenizer",
111
+ "unk_token": "<unk>"
112
+ }
training_info.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train_examples": 38636,
3
+ "eval_examples": 4293,
4
+ "train_steps": 1500,
5
+ "learning_rate": 2.5e-05,
6
+ "train_runtime": 4486.0918,
7
+ "train_loss": 39.88381803385417,
8
+ "is_peft_model": true,
9
+ "backend_name": "lora",
10
+ "added_special_tokens": 4131
11
+ }