Text Generation
Transformers
qwen3
nebula-s
svms
math-reasoning
competition-math
quantized
int4
hqq
conversational
Instructions to use decompute/Nebula-S-v1-lite with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use decompute/Nebula-S-v1-lite with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="decompute/Nebula-S-v1-lite")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("decompute/Nebula-S-v1-lite")
model = AutoModelForCausalLM.from_pretrained("decompute/Nebula-S-v1-lite")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use decompute/Nebula-S-v1-lite with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "decompute/Nebula-S-v1-lite"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "decompute/Nebula-S-v1-lite",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/decompute/Nebula-S-v1-lite
- SGLang
How to use decompute/Nebula-S-v1-lite with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "decompute/Nebula-S-v1-lite" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "decompute/Nebula-S-v1-lite",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "decompute/Nebula-S-v1-lite" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "decompute/Nebula-S-v1-lite",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use decompute/Nebula-S-v1-lite with Docker Model Runner:
docker model run hf.co/decompute/Nebula-S-v1-lite
| #!/usr/bin/env python3 | |
| """Nebula-S-v1-lite — pre-quantized int4 (cross-platform HQQ). | |
| Backbone is already quantized to int4 on disk. Works on Mac (MPS), CUDA, CPU. | |
| Requires: pip install torch "transformers>=4.51.0" hqq | |
| Usage: | |
| from nebula_s import load_nebula_s | |
| model, tokenizer = load_nebula_s("./Nebula-S-v1-lite") | |
| """ | |
| import torch,json,os,base64,zlib,hashlib,types,sys | |
| _E0="/8ce5hKi1orFGntAvF36ynVVtY6N0eVm5t3bmuOVlYAPhpOCtWG82bEIubMDVQHwE8FwRiGbvR0K2HbLcOBvHSuJ29BdnUZu6Ur7umXbqSac4vwjoC2AUOqe1ChItG7MuTscqiq42CRJZYVSt1R+uiUbRroAjpUpBuZI3QbkfbUnHNdbz7q/wVN+hhUYsUze4My1XwG89Kgp0bmkEuaueIzzPNsiO/eGTrUEELDCz9oUHcGE2/v+HvAuijRN/FLQK+1rDOa1zPKgiaxqpHt/bZAiPhb11aqN7eW4WtN7WNkyiT3dv/9qNJWA6xd6o09M+5uEOkpgkg93XU+JHh654fYJTXL4s6EFEEnCjMOqfj8qWi9xOcxGq+8KlKfaWwRRQ2gM+uzjyswWJwQrlCWbZEqmkm0TTJBCz7HNn24WJAA5RA7gxQS7WoTRE7ex428STxjny8xjkVC36REt2rtOIpLlfdCb5TMtQ3tT7zdIwxTEhs+O8L1PZY1mTofHfwsCZjrFltvE8KNG80w/ml5pLAxgpweuSjZgGHlN2Y3Bf0vPbQs425Hj4SMWjlYXbccDgQPHJfLgXsmtDy6knlXzwAtXrjS4Bagc1jIrnGd1r8yUgzuQm/jFFe9Ddh4+iPHS5VyfbF74JixO8hiZMPNokDmzaN9KBnARKGLJVTcuc/GAmYcYYy3HeJppBqr5SjOx1O/BX00BSicLYZOM4ABfy4ag9a/A0Mayg42l/JagT8az/6zScUPtTam2JRv8zNmdK6KpP5lf2akgjfdDGcFnsV++mSwc8U1Z5a1IjM4vTqLIRbdnuiW/R8583hR4NoZ0Oiii4LdeM3+mCFe/08FrplE3n+wnwGypjHVEN6HXh+elqtP8UrbaKruAv5B5n2Imm3aYi1aCJVPQRqFhGMMFb/yaPqVISm6ksnVLMrJCqjmM+P8MtHkm/ajyImQhkfykO9GXX7BfoXfCxGjdF14a6Y6eJTbqmWHRkh3/i6MeTPj1B07ksMNVCWnIEFwjNb/qlJ0E" | |
| _E1="/8duKKRjL4nPYZra6ekNXniqiul7RtTLQgNd/pdWObS2STyWbr1mvHTYX4dzcQqmWdV+vgoHXYaAQNMo6kZYVfzo8EO40NhBh5HAOy03MqINspjpLC7PXpzofozedAqTSeO6E28ebAQyWQAMuYjF61Uc64AmJlwVZk98SCflX29uMRXKjCdTnx+257N5/HCQXElCgioW7rNJJ+waxqjRjeL6JBqtZuzEZ7bbVx9mESHEyflrhd5IKsZBb7g1tp/gMSZBgpYAsnmjAF1U19+AQAPrHDhO2uJwkL9HWmz5YKloM24+70gxH8ueRkZ+bvtGbGP7mYnEKv7LFbGlMYuP3bqUzNR52PPqWqL9kclj/AKU7i8+4IevaBdLqTxmlk+J/tj9yRqgaUJLgajCh20yZBhJ11BJypTAjQm5FKW4bi4ImBjugHDrwaMej62F7WIW/OXSE9MdxWvR69806EKBVwvgiGY+n2lTsgchka6krExRy4Q7zB9KUl53YZYfiyAU/+1VnON+77I/EGPmdh3aV/KaYMmu8FguYmwkY7YXP3VwYjl1h6sHl1BkQ/QGr35gM8ugCGYaFpev/DPvHcujzixOmV4XDNpzTPCOrBUhVSHcZBJn4e7kC4BWFV4D+3xVf9+r29ZuhNLmW721pL2tPGHyn/J/DIgv9hz7ON8Cc/UYXg85IPghabg81/FcIJPr2SDi34miKiryvMYxiHZTK2hXLpRLqY0SWKrlDfsccW60B4S81dcUC2K4NhHN9OTyDS5XgNSlXxDeveTvKvQppGIgftC6s7+nUJ3OUcbupg6czgK7doLLZFO9YRbCo0+gvp0yNcS8zPKNy+y//fOCKP3Mvmb4lMZBeYy/vZ3Acww3zlKrMBnIcbxJu10QuLDRXfyTQdl85fp5iuaShEpvcp9GX5Wi2nwivxcfzQcux+nGwmXFr0gQEyJpMkB+3jQ/jwh5koAVinT4zY5pExXu5JocaexXTqwtasf5lyO5apZb5bJpZvIroUVGoFWxWbfuObSmISUQdifx+rbTgO3QL6zeX8HrKkeUIuOAh/Kqkk0bqnBBz8wSZ1XniBIlaUDmbnra2pJYXxqQciT9iv/yN7Cx8sAObEG7hgGFcunBTqHC1yFEYJaNfPLyJhJ0hvfVjHF+IhoTOc5EY4Hsw5q2q/cRJFVJQFA2r6S/+6rL9Opy/wKy9IKPXjMN6L8uA0blB4pYWetrnWYDEn3I/zR56HOq7LIPPfr8+4i25gOupkvRkrnWMgdxY01MzbvN0cThn7mYP+CVloFUYi0b8oAXDKRnprLujetboNnkeqOnW0NJno6Zqib0pSEsPmBRl9KAA+LHgxBAhdddEDOe/RPIwxsmHe1mqFOP2FmKu1haNlqfC5ZKMd4UMLN9MvQfnki/D7FR+owyvLPjwpl03i6vkuNGP7tVHIzY9WbROftwbLOfW571dh6jxbhpdDqACwWgcZoevlFHXxvp2PPB7sXOFnvHCMFpFt8IR8A7xXCpterf9fTGQcnt+/3b0LdgxcFJdYrfRo5QfCBJ13Na6U60H4cn2RSouAYY4EZ96LHACOhgMVbFSMxywTvO0kmatGcJhlaWTy6MI2ts6vQNghx9Mdv7redIh6lznhhEHfUTJw2IO5kZU4ZNa92TxRGgkiGotUkNKUeHVoelwS8zgRafu59+cQ6mCozoQMk69AKgwl3OzGIGf4Dnle4ctGlr4zbmOVdppwqPdfae/ZaVyrzYcIWxqCjfhB6w6Z4qo78vqAgFI7+PqL5g0TxjVkeGo8MT5mH7bTJ08+Ps0UPfJI+Lz9+OY9hpDLhKKO2TpQfofa0pjal1SxGaNjYpxGk97yGf7vMc12X24wmyr3DZXleRMvZ+W3QXvbjsSAwfy0n+hDQzAQA0Y+ibYrPs4JV54qvMRwHBfG7WGx798uxcmXQO+nqcjpLv90H2cmNhx7fH0ITOdWTSh1SxcBBP9vk1GP9eRENBuj6SJ
yxMCX0ySMCxSViTPVdyzCVh5EMY9rFuufXn39qfxQzRJ6oXqGegQzA2hkZ8nq0xH7KhVIhu6LgeJK+8Dn9pV1NPNRSpiPK3HoXsz/gaOVcI3mNfJYY3Xz/juLBazP3lri2bOndoU+HscF2+9UR8eHGSAcvfHdRTZxZVV9A2uy94D7BzhR3OlH+/SxkCfoRZN9sFXn0CnWu9TlqIYsmL+ur0ftwQ7yDRvuLrkaIVoiABb83+5/bM+dKNCgaQ7RZeYfrFgTZUgxByGxO/Bmr2kQu8SLQA2x4zGfvw2Wahey5KNAnUxoh+sOfgvqyjOlGl4bzqLOnw+/0JbHOlBQF/1lOvzYCpazHvGcnxi+sSebHat0dkvqAoTJ5BYFJaKzbfzZkrXto7V7HquGXNVpB5QQ/tSxqNdBDPP+RAGauNmeDSrDIShi0vdJ9Vj8/j8Zw+uaCccZoDfooD8B8qlrAXd8pqn5lDIb5zt2OdYFu+Kos2ekQEhqJsxLxlEEhJdzqSwzviDYuDDrpPfrrllaGRopBp3VWd3UTPuNtdT32t9JRGbckgId+frCzaOEXkbDRN8K6M45U9x5ivPgcL7iKFyYUAjBuyAJn9L7yVak5bDWdcym3ii9C0ozqMkv435zD+C5xn9vW8dhlQmPCENGSifi8QqpXMlt8MpjFK1uyv0+4dDClW7zsA4FJ6VI7OI1ZTw1dSE/4Fd5T115SDlyoWWCY0hGV6erZYpuo7pEfhp1e8z3rwOIwzs8Y+Hvt3Jd8IILHLh2ZH60A6ZdyYY6Gl9eoy0q25kKN9BFKLwYE7ElD5pbFG6JikzurNJkXfVZw/6hNeLyT0m304ghcf5AerIg+sKwzwJaz+lrNGXiOM69Bs3v3qoGiKxldl3SVmXC9D5DKFr1QKAX4orFPbpWs0ukJLv3p7XOArgqvRkKiCiYoWDpuQm4jf1iNNl6UsELwFUjkB/4t/y37W7Fax2sjcJkcObcWYzP5Usos2kUkYn7OvJIROgj/+pixhHgodvIa1wGENvHGQP/WGy7ZOLQIX8X5DO7ls7k6OzVBMT/VrOe4PKE7TUoNMsfkwKQasaQVks559tja2K/mcPbqnO8JUZgKdJfN12UhRmjzqF9Xr/qV1v1R3VVRXWW9RPju/se1TQPBhAzRzq3WcB4UcT6ZaSxYvm3aTDRAlPuugr5LfYSnpoGF6SkaCqmRsc88DsfdXJJ12ZMV8C8+msyWTX/jWOLGh" | |
| _KN=3;_KE=64 | |
def _dk(pt_path):
    """Derive the blob-decoding key from the adapter checkpoint at ``pt_path``.

    The checkpoint is loaded on CPU with ``weights_only=True``. The entries
    for the first ``_KN`` keys (in sorted order) are each cut to their first
    ``_KE`` items along the leading dimension, cast to float32, and
    serialized to raw bytes; the concatenation of those bytes is hashed.

    Returns:
        The SHA-512 digest (bytes) of the concatenated data.
    """
    state = torch.load(pt_path, map_location="cpu", weights_only=True)
    chunks = []
    for name in sorted(state.keys())[:_KN]:
        head = state[name][:_KE]
        chunks.append(head.to(torch.float32).numpy().tobytes())
    return hashlib.sha512(b"".join(chunks)).digest()
def _xr(blob, key):
    """Decode an embedded blob and return its decompressed bytes.

    The blob is base64-decoded, each byte is XOR-ed with ``key`` (the key
    repeats cyclically over the data), and the result is inflated with zlib.

    Args:
        blob: base64-encoded payload.
        key: indexable sequence of byte values used as the repeating XOR key.

    Returns:
        The zlib-decompressed payload as ``bytes``.
    """
    cipher = base64.b64decode(blob)
    plain = bytes(
        value ^ key[pos % len(key)] for pos, value in enumerate(cipher)
    )
    return zlib.decompress(plain)
def load_nebula_s(model_dir, device=None):
    """Load Nebula-S-v1-lite (pre-quantized int4 backbone via HQQ).

    Args:
        model_dir: path to the Nebula-S-v1-lite directory
        device: "cuda", "mps", or "cpu" (auto-detects if None)

    Returns:
        model: model with .generate() method
        tokenizer: tokenizer

    Raises:
        ImportError: if ``hqq`` cannot be imported.
    """
    try:
        from hqq.models.hf.base import AutoHQQHFModel
    except ImportError as err:
        # Chain explicitly so the underlying import failure is reported as
        # the direct cause of this friendlier message.
        raise ImportError("Nebula-S-v1-lite requires hqq: pip install hqq") from err
    from transformers import AutoTokenizer

    # Device preference when none is given: CUDA, then Apple MPS, then CPU.
    if device is None:
        if torch.cuda.is_available():
            device = "cuda"
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"
    print(f"Loading Nebula-S-v1-lite on {device}...")

    # The key for the embedded blobs is derived from the adapter checkpoint.
    pt = os.path.join(model_dir, "nebula_s_adapter.pt")
    key = _dk(pt)
    mf = json.loads(_xr(_E0, key))
    rt_src = _xr(_E1, key).decode()

    # SECURITY NOTE(review): this executes Python source decoded from an
    # embedded blob, with a key derived from a file under ``model_dir``.
    # The code cannot be audited before it runs; only use with a model
    # directory and a copy of this file that you trust.
    _m = types.ModuleType("_nrt")
    exec(rt_src, _m.__dict__)

    bk = AutoHQQHFModel.from_quantized(model_dir, compute_dtype=torch.bfloat16, device=device)
    tk = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

    raw = torch.load(pt, map_location="cpu", weights_only=True)
    # Each manifest entry names a weight e["n"], built from the first e["l"]
    # items of checkpoint entry e["k"] and reshaped to e["s"].
    wt = {e["n"]: raw[e["k"]][:e["l"]].reshape(e["s"]) for e in mf}

    mdl = _m._NM(bk, wt, dev=device)
    return mdl, tk
if __name__ == "__main__":
    # Demo entry point: an optional first CLI argument overrides the
    # default model directory.
    cli_args = sys.argv[1:]
    model_path = cli_args[0] if cli_args else "./Nebula-S-v1-lite"
    model, tokenizer = load_nebula_s(model_path)

    question = "Solve step by step: What is the sum of all prime numbers less than 20?"
    print(f"\nPrompt: {question}")

    # Render the single-turn chat into prompt text, then tokenize it onto
    # the device of the model's first parameter.
    chat = [{"role": "user", "content": question}]
    rendered = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    target_device = next(model.parameters()).device
    encoded = tokenizer(rendered, return_tensors="pt").to(target_device)

    answer = model.generate(
        encoded["input_ids"],
        encoded["attention_mask"],
        tokenizer,
        max_new_tokens=1024,
    )
    print(f"\nResponse:\n{answer}")