MSGEncrypted committed on 15 days ago

Commit

2130d4e

verified ·

1 Parent(s): f93e317

Publish math-lora (gate passed: gsm8k)

Browse files

Files changed (24) hide show

README.md +64 -0
adapter_config.json +48 -0
adapter_model.safetensors +3 -0
chat_template.jinja +179 -0
checkpoint-100/README.md +207 -0
checkpoint-100/adapter_config.json +48 -0
checkpoint-100/adapter_model.safetensors +3 -0
checkpoint-100/optimizer.pt +3 -0
checkpoint-100/rng_state.pth +3 -0
checkpoint-100/scheduler.pt +3 -0
checkpoint-100/trainer_state.json +112 -0
checkpoint-100/training_args.bin +3 -0
checkpoint-150/README.md +207 -0
checkpoint-150/adapter_config.json +48 -0
checkpoint-150/adapter_model.safetensors +3 -0
checkpoint-150/optimizer.pt +3 -0
checkpoint-150/rng_state.pth +3 -0
checkpoint-150/scheduler.pt +3 -0
checkpoint-150/trainer_state.json +179 -0
checkpoint-150/training_args.bin +3 -0
tokenizer.json +0 -0
tokenizer_config.json +17 -0
training_args.bin +3 -0
training_results.json +46 -0

README.md ADDED Viewed

	@@ -0,0 +1,64 @@

+---
+library_name: peft
+base_model: openbmb/MiniCPM5-1B
+license: apache-2.0
+tags:
+  - lora
+  - qlora
+  - build-small-hackathon
+  - well-tuned
+  - math
+---
+# math-lora
+QLoRA adapter for **math**, fine-tuned from `openbmb/MiniCPM5-1B` on `meta-math/MetaMathQA` + `tatsu-lab/alpaca` (format: `mix`).
+Trained, evaluated, and gated on [Modal](https://modal.com/docs/guide) via `research/modal/` (app `slm-finetune-benchmark`).
+## Benchmark gate
+- eval profile: `math`
+- gate: **PASSED**
+| check | value | result |
+| --- | ---: | --- |
+| gsm8k >= 0.05 | 0.4000 | pass |
+| gsm8k improve >= 0.02 | 0.0700 | pass |
+| arc_challenge regress <= 0.03 | -0.0500 | pass |
+| hellaswag regress <= 0.03 | 0.0000 | pass |
+| piqa regress <= 0.03 | 0.0200 | pass |
+## lm-eval results
+| task | metric | baseline | candidate | delta |
+| --- | --- | ---: | ---: | ---: |
+| arc_challenge | acc,none | 0.3200 | 0.3700 | +0.0500 |
+| gsm8k | exact_match,strict-match | 0.3300 | 0.4000 | +0.0700 |
+| hellaswag | acc,none | 0.4300 | 0.4300 | +0.0000 |
+| piqa | acc,none | 0.7200 | 0.7000 | -0.0200 |
+## Training
+- dataset: `/repo/research/data/education-lesson-chat.jsonl`
+- mode: `qlora`
+- samples: 3528 train / 72 eval
+- final train loss: 0.340698
+- eval loss: 0.494981
+## Load with PEFT
+```python
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+base = "openbmb/MiniCPM5-1B"
+adapter = "MSGEncrypted/minicpm5-1b-math-lora"
+tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    base, torch_dtype="auto", device_map="auto", trust_remote_code=True
+)
+model = PeftModel.from_pretrained(model, adapter)
+```

adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "openbmb/MiniCPM5-1B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e024be6b763d331e1aa3095616eb6b46419aeaa1736c27e3de299a627e035cda
+size 89697856

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,179 @@

+{#- Prompt header: emits the BOS token, then an optional system turn. -#}
+{#- With tools: the system turn carries the tool definitions, merged into messages[0] when that message has the system role. -#}
+{#- Without tools: a system turn is emitted only when messages[0] has the system role. -#}
+{#- NOTE(review): messages[0] is indexed without a length check; presumably callers always pass at least one message - confirm. -#}
+{{- bos_token }}{%- if tools %}
+    {#- Capture the tool preamble as a string: every tool serialised as JSON inside <tools> tags, followed by the calling guidelines. -#}
+    {%- set tool_definitions %}
+        {{- "# Tools\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+        {%- for tool in tools %}
+            {{- "\n" }}
+            {{- tool | tojson(ensure_ascii=False) }}
+        {%- endfor %}
+        {{- '\n</tools>\n\nTool usage guidelines:\n- You may call zero or more functions. If no function calls are needed, just answer normally and do not include any <function ... </function>.\n- When calling a function, return an XML object within <function ... </function> using:\n<function name="function-name"><param name="param-name">param-value</param></function>\n- param-value may be multi-line. If it contains <, & or newline characters, wrap it in a CDATA block: <param name="param-name"><![CDATA[...multi-line value...]]></param>' }}
+    {%- endset %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {#- A <tool_def_sep> placeholder in the system message marks where the tool definitions are substituted; without it they are appended after a blank line. -#}
+        {%- if '<tool_def_sep>' in messages[0].content %}
+            {{- messages[0].content.replace('<tool_def_sep>', tool_definitions) }}
+        {%- else %}
+            {{- messages[0].content + '\n\n' + tool_definitions }}
+        {%- endif %}
+    {%- else %}
+        {#- No system message supplied: the tool definitions alone form the system turn, with leading whitespace stripped. -#}
+        {{- tool_definitions.lstrip() }}
+    {%- endif %}
+    {{- '<|im_end|>\n' }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Locate the most recent genuine user query: a user message whose content is a string and is not wrapped in <tool_response> tags. -#}
+{#- A namespace object holds the result so the assignments made inside the loop remain visible after it. -#}
+{#- last_query_index starts at the final message index and keeps that value when no message matches. -#}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{#- Walk the messages newest-first; index maps the reversed position back to the position in the original list. -#}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {#- multi_step_tool is cleared on the first match, so only the latest matching user message is recorded. -#}
+    {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{#- Main rendering loop: emits one <|im_start|>role ... <|im_end|> turn per message; consecutive tool messages share a single user turn. -#}
+{%- for message in messages %}
+    {#- Non-string content is rendered as an empty string in the user, system and assistant branches. -#}
+    {%- if message.content is string %}
+        {%- set content = message.content %}
+    {%- else %}
+        {%- set content = '' %}
+    {%- endif %}
+    {#- User messages, and system messages other than the first one, are emitted verbatim. -#}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {#- Reasoning text comes from message.reasoning_content when that is a string; otherwise it is split out of the content at the think tags. -#}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in content %}
+                {#- Reasoning: text before the first closing think tag, after the last opening think tag within it. Content: text after the last closing think tag. -#}
+                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {#- Splits the content on <tool_sep> markers and builds processed_content by interleaving serialised tool calls with the text segments. -#}
+            {#- NOTE(review): processed_content is reassigned with set inside for loops. In Jinja2 such assignments are not visible outside the loop unless made through a namespace object, so processed_content presumably still equals content_parts[0] after the loops - confirm intended behaviour. -#}
+            {#- NOTE(review): min_count is computed but not referenced anywhere in the visible template. -#}
+            {%- set content_parts = content.split('<tool_sep>') %}
+            {%- set processed_content = content_parts[0] %}
+            {%- set tool_calls_count = message.tool_calls|length %}
+            {%- set tool_sep_count = content_parts|length - 1 %}
+            {%- set min_count = [tool_calls_count, tool_sep_count]|min %}
+            {%- for i in range(1, content_parts|length) %}
+                {%- set tool_index = i - 1 %}
+                {%- if tool_index < tool_calls_count %}
+                    {%- set tool_call = message.tool_calls[tool_index] %}
+                    {#- Accept both a wrapped call (with a function attribute) and a bare call object. -#}
+                    {%- if tool_call.function %}
+                        {%- set tool_call = tool_call.function %}
+                    {%- endif %}
+                    {%- set single_tool_xml %}
+                        {{- '<function name="' ~ tool_call.name ~ '">' }}
+                        {%- if tool_call.arguments %}
+                            {#- NOTE(review): .items() is called on arguments, so this assumes a mapping rather than a JSON-encoded string - confirm. -#}
+                            {%- set args_dict = tool_call.arguments %}
+                            {%- for param_name, param_value in args_dict.items() %}
+                                {{- '<param name="' ~ param_name ~ '">' }}
+                                {#- String values containing <, & or a newline are wrapped in a CDATA section; everything else is emitted as-is. -#}
+                                {%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
+                                    {{- '<![CDATA[' + param_value + ']]>' }}
+                                {%- else %}
+                                    {{- param_value }}
+                                {%- endif %}
+                                {{- '</param>' }}
+                            {%- endfor %}
+                        {%- endif %}
+                        {{- '</function>' }}
+                    {%- endset %}
+                    {%- set processed_content = processed_content + single_tool_xml + content_parts[i] %}
+                {%- else %}
+                    {%- set processed_content = processed_content + content_parts[i] %}
+                {%- endif %}
+            {%- endfor %}
+            {#- Tool calls beyond the number of <tool_sep> markers are serialised the same way and appended. -#}
+            {%- if tool_calls_count > tool_sep_count %}
+                {%- for remaining_index in range(tool_sep_count, tool_calls_count) %}
+                    {%- set tool_call = message.tool_calls[remaining_index] %}
+                    {%- if tool_call.function %}
+                        {%- set tool_call = tool_call.function %}
+                    {%- endif %}
+                    {%- set remaining_tool_xml %}
+                        {{- '<function name="' ~ tool_call.name ~ '">' }}
+                        {%- if tool_call.arguments %}
+                            {%- set args_dict = tool_call.arguments %}
+                            {%- for param_name, param_value in args_dict.items() %}
+                                {{- '<param name="' ~ param_name ~ '">' }}
+                                {%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
+                                    {{- '<![CDATA[' + param_value + ']]>' }}
+                                {%- else %}
+                                    {{- param_value }}
+                                {%- endif %}
+                                {{- '</param>' }}
+                            {%- endfor %}
+                        {%- endif %}
+                        {{- '</function>' }}
+                    {%- endset %}
+                    {%- set processed_content = processed_content + remaining_tool_xml %}
+                {%- endfor %}
+            {%- endif %}
+            {%- set content = processed_content %}
+        {%- endif %}
+        {#- The think block is rendered only for assistant turns that come after the last genuine user query; earlier turns are emitted without it. -#}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if reasoning_content %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {#- NOTE(review): has_tool_sep is not assigned anywhere in the visible template. If it is undefined, presumably it evaluates as false and this branch runs for every assistant message with tool calls - confirm against the upstream template. -#}
+        {%- if message.tool_calls and not has_tool_sep %}
+            {%- for tool_call in message.tool_calls %}
+                {#- A newline separates tool calls; it is skipped before the first call when the content is empty. -#}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<function name="' ~ tool_call.name ~ '">' }}
+                {%- if tool_call.arguments %}
+                    {%- set args_dict = tool_call.arguments %}
+                    {%- for param_name, param_value in args_dict.items() %}
+                        {{- '<param name="' ~ param_name ~ '">' }}
+                        {%- if param_value is string and ('<' in param_value or '&' in param_value or '\n' in param_value) %}
+                            {{- '<![CDATA[' + param_value + ']]>' }}
+                        {%- else %}
+                            {{- param_value }}
+                        {%- endif %}
+                        {{- '</param>' }}
+                    {%- endfor %}
+                {%- endif %}
+                {{- '</function>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- Tool results are rendered inside a user turn: the turn opens at the first message of a run of tool messages and closes after the last one. -#}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {#- String results are emitted as-is; any other value is serialised to JSON. -#}
+        {%- if message.content is string %}
+            {{- content }}
+        {%- else %}
+            {{- message.content | tojson(ensure_ascii=False) }}
+        {%- endif %}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- Generation prompt: opens an assistant turn when add_generation_prompt is set. -#}
+{#- enable_thinking false: an empty think block is pre-filled. enable_thinking true: the think block is opened and left unclosed. -#}
+{#- When enable_thinking is undefined, or is neither true nor false, nothing is added after the assistant header. -#}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined %}
+        {%- if enable_thinking is false %}
+            {{- '<think>\n\n</think>\n\n' }}
+        {%- elif enable_thinking is true %}
+            {{- '<think>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endif %}

checkpoint-100/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: openbmb/MiniCPM5-1B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:openbmb/MiniCPM5-1B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoint-100/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "openbmb/MiniCPM5-1B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-100/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:86cab059d6d4fdeda10343e5be6a7b6dd0425c781a470d6cd57e81310396f39d
+size 44871152

checkpoint-100/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b8c5824cce21e65493e41f51d5805e3504798bded2db69270d8689b2356eae38
+size 89940563

checkpoint-100/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb6413690bb6ce9e7598b1601c4fc0ffcde007db8991ca720c47c83060128a23
+size 14645

checkpoint-100/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76a2417c552f2bdc77f02e5895b11a8ab016d87e3e5eb1ba1c862a4cfd0fd110
+size 1465

checkpoint-100/trainer_state.json ADDED Viewed

	@@ -0,0 +1,112 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.620408163265306,
+  "eval_steps": 200,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.16326530612244897,
+      "grad_norm": 0.6756201386451721,
+      "learning_rate": 0.0001981178176898239,
+      "loss": 0.770789909362793,
+      "step": 10
+    },
+    {
+      "epoch": 0.32653061224489793,
+      "grad_norm": 0.6545600295066833,
+      "learning_rate": 0.00018687117365181512,
+      "loss": 0.7264016151428223,
+      "step": 20
+    },
+    {
+      "epoch": 0.4897959183673469,
+      "grad_norm": 0.683688223361969,
+      "learning_rate": 0.00016659152250116812,
+      "loss": 0.7400761127471924,
+      "step": 30
+    },
+    {
+      "epoch": 0.6530612244897959,
+      "grad_norm": 0.47568151354789734,
+      "learning_rate": 0.00013938757562492873,
+      "loss": 0.7011314392089844,
+      "step": 40
+    },
+    {
+      "epoch": 0.8163265306122449,
+      "grad_norm": 0.42000776529312134,
+      "learning_rate": 0.00010808804403614043,
+      "loss": 0.672628116607666,
+      "step": 50
+    },
+    {
+      "epoch": 0.9795918367346939,
+      "grad_norm": 0.4876711070537567,
+      "learning_rate": 7.594750436337467e-05,
+      "loss": 0.5931291103363037,
+      "step": 60
+    },
+    {
+      "epoch": 1.1306122448979592,
+      "grad_norm": 0.5444473624229431,
+      "learning_rate": 4.630798263510162e-05,
+      "loss": 0.6527483463287354,
+      "step": 70
+    },
+    {
+      "epoch": 1.2938775510204081,
+      "grad_norm": 0.6376090049743652,
+      "learning_rate": 2.2251444932035094e-05,
+      "loss": 0.576920461654663,
+      "step": 80
+    },
+    {
+      "epoch": 1.457142857142857,
+      "grad_norm": 0.541858971118927,
+      "learning_rate": 6.2793294993656494e-06,
+      "loss": 0.5700631141662598,
+      "step": 90
+    },
+    {
+      "epoch": 1.620408163265306,
+      "grad_norm": 0.4305003583431244,
+      "learning_rate": 5.2443095448506674e-08,
+      "loss": 0.5232778549194336,
+      "step": 100
+    },
+    {
+      "epoch": 1.620408163265306,
+      "eval_loss": 0.8235033750534058,
+      "eval_runtime": 0.6803,
+      "eval_samples_per_second": 29.398,
+      "eval_steps_per_second": 7.349,
+      "step": 100
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 100,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3287268790345728.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-100/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd65839e9bd18646e61b40ae83f868c453857ea4d9939b9fd348d8bf2becd23e
+size 5201

checkpoint-150/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: openbmb/MiniCPM5-1B
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:openbmb/MiniCPM5-1B
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoint-150/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "openbmb/MiniCPM5-1B",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-150/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e024be6b763d331e1aa3095616eb6b46419aeaa1736c27e3de299a627e035cda
+size 89697856

checkpoint-150/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da736af699c86703b742434ea40618d38ffa85320a34d95c1bbcb7e5da923310
+size 179594003

checkpoint-150/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8077321b6911087294dc7b05c9a860579d6291a34cbd5abc1975ab53b3a0b0f3
+size 14645

checkpoint-150/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f847bf189265d8eee7da89b2c635bd5ad82bf85f644dadbc5646ac3ed438aa14
+size 1465

checkpoint-150/trainer_state.json ADDED Viewed

	@@ -0,0 +1,179 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6802721088435374,
+  "eval_steps": 30,
+  "global_step": 150,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.045351473922902494,
+      "grad_norm": 0.5955151915550232,
+      "learning_rate": 0.00019962469712828614,
+      "loss": 0.4111457824707031,
+      "step": 10
+    },
+    {
+      "epoch": 0.09070294784580499,
+      "grad_norm": 0.5896233320236206,
+      "learning_rate": 0.00019543482507085482,
+      "loss": 0.35993859767913816,
+      "step": 20
+    },
+    {
+      "epoch": 0.1360544217687075,
+      "grad_norm": 0.48774707317352295,
+      "learning_rate": 0.00018678252666130013,
+      "loss": 0.38935122489929197,
+      "step": 30
+    },
+    {
+      "epoch": 0.1360544217687075,
+      "eval_loss": 0.5217297077178955,
+      "eval_runtime": 2.0775,
+      "eval_samples_per_second": 34.657,
+      "eval_steps_per_second": 8.664,
+      "step": 30
+    },
+    {
+      "epoch": 0.18140589569160998,
+      "grad_norm": 0.4432049095630646,
+      "learning_rate": 0.00017407237375691392,
+      "loss": 0.3316951274871826,
+      "step": 40
+    },
+    {
+      "epoch": 0.22675736961451248,
+      "grad_norm": 0.40836411714553833,
+      "learning_rate": 0.0001578986789811849,
+      "loss": 0.31510772705078127,
+      "step": 50
+    },
+    {
+      "epoch": 0.272108843537415,
+      "grad_norm": 0.3234673738479614,
+      "learning_rate": 0.00013901770632605547,
+      "loss": 0.3653845310211182,
+      "step": 60
+    },
+    {
+      "epoch": 0.272108843537415,
+      "eval_loss": 0.5070984363555908,
+      "eval_runtime": 2.0635,
+      "eval_samples_per_second": 34.891,
+      "eval_steps_per_second": 8.723,
+      "step": 60
+    },
+    {
+      "epoch": 0.31746031746031744,
+      "grad_norm": 0.28090140223503113,
+      "learning_rate": 0.00011831230908818563,
+      "loss": 0.2907076835632324,
+      "step": 70
+    },
+    {
+      "epoch": 0.36281179138321995,
+      "grad_norm": 0.33969420194625854,
+      "learning_rate": 9.675064863002196e-05,
+      "loss": 0.3239952325820923,
+      "step": 80
+    },
+    {
+      "epoch": 0.40816326530612246,
+      "grad_norm": 0.3883151113986969,
+      "learning_rate": 7.534092423052381e-05,
+      "loss": 0.39945359230041505,
+      "step": 90
+    },
+    {
+      "epoch": 0.40816326530612246,
+      "eval_loss": 0.49975717067718506,
+      "eval_runtime": 2.0811,
+      "eval_samples_per_second": 34.597,
+      "eval_steps_per_second": 8.649,
+      "step": 90
+    },
+    {
+      "epoch": 0.45351473922902497,
+      "grad_norm": 0.37783923745155334,
+      "learning_rate": 5.5084230807412126e-05,
+      "loss": 0.33277587890625,
+      "step": 100
+    },
+    {
+      "epoch": 0.4988662131519274,
+      "grad_norm": 0.44748321175575256,
+      "learning_rate": 3.6927748831453836e-05,
+      "loss": 0.3025733470916748,
+      "step": 110
+    },
+    {
+      "epoch": 0.54421768707483,
+      "grad_norm": 0.39024776220321655,
+      "learning_rate": 2.1720455220364444e-05,
+      "loss": 0.30908067226409913,
+      "step": 120
+    },
+    {
+      "epoch": 0.54421768707483,
+      "eval_loss": 0.4952836036682129,
+      "eval_runtime": 2.0845,
+      "eval_samples_per_second": 34.541,
+      "eval_steps_per_second": 8.635,
+      "step": 120
+    },
+    {
+      "epoch": 0.5895691609977324,
+      "grad_norm": 0.39623892307281494,
+      "learning_rate": 1.0173426121705576e-05,
+      "loss": 0.3232832193374634,
+      "step": 130
+    },
+    {
+      "epoch": 0.6349206349206349,
+      "grad_norm": 0.3841199576854706,
+      "learning_rate": 2.826587782529444e-06,
+      "loss": 0.3398961782455444,
+      "step": 140
+    },
+    {
+      "epoch": 0.6802721088435374,
+      "grad_norm": 0.4336070120334625,
+      "learning_rate": 2.347019815158724e-08,
+      "loss": 0.34069836139678955,
+      "step": 150
+    },
+    {
+      "epoch": 0.6802721088435374,
+      "eval_loss": 0.49498093128204346,
+      "eval_runtime": 2.0755,
+      "eval_samples_per_second": 34.691,
+      "eval_steps_per_second": 8.673,
+      "step": 150
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 150,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4361616479379456.0,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-150/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c132835da72e5a98a55da309abaadcb858dfcd4c53b6c7dfaf23d45de30ee7a7
+size 5201

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "add_prefix_space": null,
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "is_local": false,
+  "legacy": true,
+  "local_files_only": false,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c132835da72e5a98a55da309abaadcb858dfcd4c53b6c7dfaf23d45de30ee7a7
+size 5201

training_results.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "model": "openbmb/MiniCPM5-1B",
+  "preset": "minicpm5-1b",
+  "dataset": "/repo/research/data/education-lesson-chat.jsonl",
+  "dataset_config": null,
+  "dataset_split": "train",
+  "mix": [
+    {
+      "dataset": "meta-math/MetaMathQA",
+      "format": "prompt",
+      "columns": {
+        "prompt": "query",
+        "response": "response"
+      },
+      "dataset_split": "train[:3000]",
+      "max_samples": 3000
+    },
+    {
+      "dataset": "tatsu-lab/alpaca",
+      "format": "alpaca",
+      "dataset_split": "train[:600]",
+      "max_samples": 600
+    }
+  ],
+  "format": "chat",
+  "mode": "qlora",
+  "output_dir": "/vol/finetuned/math-lora",
+  "samples": {
+    "train": 3528,
+    "eval": 72
+  },
+  "metrics": {
+    "final_train_loss": 0.340698,
+    "eval_loss": 0.494981,
+    "perplexity": 1.6405,
+    "loss_score": 0.494981,
+    "result_score": 60.96
+  },
+  "training": {
+    "epochs": 1.0,
+    "max_steps": 150,
+    "global_step": 150,
+    "train_runtime_sec": 310.98,
+    "train_samples_per_second": 7.718
+  }
+}