Instructions to use dangkhoa99/falcon-7b-finetuned-QA-MRC-4-bit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use dangkhoa99/falcon-7b-finetuned-QA-MRC-4-bit with PEFT:
from peft import PeftModel from transformers import AutoModelForCausalLM base_model = AutoModelForCausalLM.from_pretrained("ybelkada/falcon-7b-sharded-bf16") model = PeftModel.from_pretrained(base_model, "dangkhoa99/falcon-7b-finetuned-QA-MRC-4-bit") - Notebooks
- Google Colab
- Kaggle
| from typing import Dict, List, Any | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
| from peft import PeftModel, PeftConfig | |
| import torch | |
| import time | |
| class EndpointHandler: | |
| def __init__(self, path="dangkhoa99/falcon-7b-finetuned-QA-MRC-4-bit"): | |
| # load the model | |
| config = PeftConfig.from_pretrained(path) | |
| bnb_config = BitsAndBytesConfig( | |
| load_in_4bit=True, | |
| bnb_4bit_use_double_quant=True, | |
| bnb_4bit_quant_type="nf4", | |
| bnb_4bit_compute_dtype=torch.float16, | |
| ) | |
| self.model = AutoModelForCausalLM.from_pretrained( | |
| config.base_model_name_or_path, | |
| return_dict=True, | |
| load_in_4bit=True, | |
| device_map={"":0}, | |
| trust_remote_code=True, | |
| quantization_config=bnb_config, | |
| ) | |
| self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path) | |
| self.tokenizer.pad_token = self.tokenizer.eos_token | |
| self.model = PeftModel.from_pretrained(self.model, path) | |
| def __call__(self, data: Any) -> Dict[str, Any]: | |
| """ | |
| Args: | |
| inputs :obj:`list`:. The object should be like {"context": "some word", "question": "some word"} containing: | |
| - "context": | |
| - "question": | |
| Return: | |
| A :obj:`list`:. The object returned should be like {"answer": "some word", time: "..."} containing: | |
| - "answer": answer the question based on the context | |
| - "time": the time run predict | |
| """ | |
| inputs = data.pop("inputs", data) | |
| context = inputs.pop("context", inputs) | |
| question = inputs.pop("question", inputs) | |
| prompt = f"""Answer the question based on the context below. If the question cannot be answered using the information provided answer with 'No answer'. Stop response if end. | |
| >>TITLE<<: Flawless answer. | |
| >>CONTEXT<<: {context} | |
| >>QUESTION<<: {question} | |
| >>ANSWER<<: | |
| """.strip() | |
| batch = self.tokenizer( | |
| prompt, | |
| padding=True, | |
| truncation=True, | |
| return_tensors='pt' | |
| ) | |
| batch = batch.to('cuda:0') | |
| generation_config = self.model.generation_config | |
| generation_config.top_p = 0.7 | |
| generation_config.temperature = 0.7 | |
| generation_config.max_new_tokens = 256 | |
| generation_config.num_return_sequences = 1 | |
| generation_config.pad_token_id = self.tokenizer.eos_token_id | |
| generation_config.eos_token_id = self.tokenizer.eos_token_id | |
| start = time.time() | |
| with torch.cuda.amp.autocast(): | |
| output_tokens = self.model.generate( | |
| input_ids = batch.input_ids, | |
| generation_config=generation_config, | |
| ) | |
| end = time.time() | |
| generated_text = self.tokenizer.decode(output_tokens[0]) | |
| prediction = {'answer': generated_text.split('>>END<<')[0].split('>>ANSWER<<:')[1].strip(), 'time': f"{(end-start):.2f} s"} | |
| return prediction |