Text Generation
PEFT
TensorBoard
Safetensors
Transformers
nemotron-nas
axolotl
lora
conversational
custom_code
8-bit precision
bitsandbytes
Instructions to use ConicCat/Nemo-super-wip-lora with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use ConicCat/Nemo-super-wip-lora with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("nvidia/Llama-3_3-Nemotron-Super-49B-v1_5")
model = PeftModel.from_pretrained(base_model, "ConicCat/Nemo-super-wip-lora")
```
- Transformers
How to use ConicCat/Nemo-super-wip-lora with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="ConicCat/Nemo-super-wip-lora", trust_remote_code=True)
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("ConicCat/Nemo-super-wip-lora", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use ConicCat/Nemo-super-wip-lora with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "ConicCat/Nemo-super-wip-lora"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "ConicCat/Nemo-super-wip-lora",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
```
Use Docker
docker model run hf.co/ConicCat/Nemo-super-wip-lora
- SGLang
How to use ConicCat/Nemo-super-wip-lora with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "ConicCat/Nemo-super-wip-lora" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "ConicCat/Nemo-super-wip-lora",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "ConicCat/Nemo-super-wip-lora" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "ConicCat/Nemo-super-wip-lora",
    "messages": [
      { "role": "user", "content": "What is the capital of France?" }
    ]
  }'
```
- Docker Model Runner
How to use ConicCat/Nemo-super-wip-lora with Docker Model Runner:
docker model run hf.co/ConicCat/Nemo-super-wip-lora
| import dataclasses | |
| import json | |
| import warnings | |
| from dataclasses import dataclass, MISSING | |
| from functools import partial | |
| from typing import Optional, Any | |
class JsonComparable:
    """Mixin that compares, orders and hashes instances by their JSON form.

    Concrete subclasses must be dataclasses, because ``to_json`` relies on
    ``dataclasses.asdict``.
    """

    def to_json(self) -> str:
        """Return the dataclass fields of this instance as a JSON string."""
        return json.dumps(dataclasses.asdict(self))

    def __eq__(self, other: object) -> bool:
        # Defer to Python's fallback for foreign operands instead of failing
        # with AttributeError on the missing ``to_json``.
        if not isinstance(other, JsonComparable):
            return NotImplemented
        return self.to_json() == other.to_json()

    def __hash__(self) -> int:
        return hash(self.to_json())

    def __lt__(self, other: object) -> bool:
        if not isinstance(other, JsonComparable):
            return NotImplemented
        return self.to_json() < other.to_json()


# NOTE: eq=False on every dataclass below keeps the JSON-based __eq__/__hash__
# inherited from JsonComparable; the generated field-tuple versions would
# shadow them and could not hash list-valued fields such as ``sparsify``.
@dataclass(frozen=True, eq=False)
class SubblockConfig(JsonComparable):
    """Options shared by the sub-blocks of a block.

    ``no_op`` and ``replace_with_linear`` are mutually exclusive.
    """

    no_op: bool = False
    replace_with_linear: bool = False
    sparsify: Optional[list[str]] = None

    def __post_init__(self):
        """Reject configs that set both ``no_op`` and ``replace_with_linear``."""
        assert not (self.no_op and self.replace_with_linear)

    def _force_setattr(self, name: str, value: Any) -> None:
        """
        Set an attribute even in frozen dataclasses.
        Use only inside __post_init__!
        """
        object.__setattr__(self, name, value)


@dataclass(frozen=True, eq=False)
class AttentionConfig(SubblockConfig):
    """Configuration of an attention sub-block.

    When ``no_op`` or ``replace_with_linear`` is set, ``n_heads_in_group``,
    ``window_length`` and ``num_sink_tokens`` are reset to ``None``;
    otherwise ``n_heads_in_group`` is required.
    """

    n_heads_in_group: Optional[int] = None
    window_length: Optional[int] = None
    num_sink_tokens: Optional[int] = None
    use_prefill_window_in_sink_attention: bool = False
    unshifted_sink: bool = False

    def __post_init__(self):
        """Normalize irrelevant attributes and validate the sink settings."""
        # The base class already checks that no_op and replace_with_linear
        # are not both set, so the check is not repeated here.
        super().__post_init__()

        if self.no_op or self.replace_with_linear:
            for irrelevant_att in ("n_heads_in_group", "window_length", "num_sink_tokens"):
                self._force_setattr(irrelevant_att, None)
        else:
            assert self.n_heads_in_group is not None

        if self.is_sink:
            assert not (self.unshifted_sink and self.use_prefill_window_in_sink_attention), \
                ("Unshifted sink uses its own kind of explicit masking, not standard window. "
                 "Set use_prefill_window_in_sink_attention to False.")
            assert not (self.num_sink_tokens == 0 and not self.unshifted_sink), \
                "Fake sink attention with 0 sink tokens is only supported with unshifted_sink=True"

    @property
    def prefill_sliding_window(self) -> Optional[int]:
        """Return ``window_length`` when it applies as a prefill window, else ``None``.

        The window applies when ``window_length`` is set and the config either
        is not a sink config or sets ``use_prefill_window_in_sink_attention``.
        """
        if self.window_length is None:
            return None
        if self.is_sink and not self.use_prefill_window_in_sink_attention:
            return None
        return self.window_length

    @property
    def is_sliding(self) -> bool:
        """Whether ``prefill_sliding_window`` yields a window."""
        return self.prefill_sliding_window is not None

    @property
    def is_sink(self) -> bool:
        """Whether both ``window_length`` and ``num_sink_tokens`` are set."""
        return self.window_length is not None and self.num_sink_tokens is not None


@dataclass(frozen=True, eq=False)
class FFNConfig(SubblockConfig):
    """Configuration of an FFN sub-block.

    ``ffn_mult`` is reset to ``None`` when ``no_op`` or
    ``replace_with_linear`` is set; otherwise it is required and stored
    rounded to 6 decimal places.
    """

    ffn_mult: Optional[float] = None

    def __post_init__(self):
        """Normalize ``ffn_mult`` according to the sub-block mode."""
        super().__post_init__()
        if self.no_op or self.replace_with_linear:
            self._force_setattr("ffn_mult", None)
        else:
            assert self.ffn_mult is not None
            self._force_setattr("ffn_mult", round(self.ffn_mult, 6))


@dataclass(frozen=True, eq=False)
class BlockConfig(JsonComparable):
    """Configuration of one block: an attention sub-block and an FFN sub-block.

    Each field may be given as an already-built config object or as a plain
    dict of that config's fields.
    """

    # MISSING as the class-level default means "no default": both are required.
    attention: AttentionConfig = MISSING
    ffn: FFNConfig = MISSING

    def __post_init__(self):
        """
        Init subblock dataclasses from dicts
        """
        for subblock in dataclasses.fields(self):
            raw_config = getattr(self, subblock.name)
            if not isinstance(raw_config, dict):
                continue

            supported_fields = {field.name for field in dataclasses.fields(subblock.type)}
            unsupported_fields = [key for key in raw_config if key not in supported_fields]
            if unsupported_fields:
                warnings.warn(f"Removed unsupported fields {unsupported_fields} from {subblock.type.__name__}")

            accepted = {key: value for key, value in raw_config.items() if key in supported_fields}
            # object.__setattr__ to overcome frozen=True
            object.__setattr__(self, subblock.name, subblock.type(**accepted))