Instructions to use royleibov/granite-3b-code-base-128k-ZipNN-Compressed with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use royleibov/granite-3b-code-base-128k-ZipNN-Compressed with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="royleibov/granite-3b-code-base-128k-ZipNN-Compressed")

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("royleibov/granite-3b-code-base-128k-ZipNN-Compressed")
model = AutoModelForCausalLM.from_pretrained("royleibov/granite-3b-code-base-128k-ZipNN-Compressed")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use royleibov/granite-3b-code-base-128k-ZipNN-Compressed with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "royleibov/granite-3b-code-base-128k-ZipNN-Compressed"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "royleibov/granite-3b-code-base-128k-ZipNN-Compressed",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/royleibov/granite-3b-code-base-128k-ZipNN-Compressed

SGLang

How to use royleibov/granite-3b-code-base-128k-ZipNN-Compressed with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "royleibov/granite-3b-code-base-128k-ZipNN-Compressed" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "royleibov/granite-3b-code-base-128k-ZipNN-Compressed",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "royleibov/granite-3b-code-base-128k-ZipNN-Compressed" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "royleibov/granite-3b-code-base-128k-ZipNN-Compressed",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use royleibov/granite-3b-code-base-128k-ZipNN-Compressed with Docker Model Runner:
```
docker model run hf.co/royleibov/granite-3b-code-base-128k-ZipNN-Compressed
```

granite-3b-code-base-128k-ZipNN-Compressed / zipnn_compress_file.py

royleibov

Add .znn files

cc8acc6 over 1 year ago

raw

history blame contribute delete

5.49 kB

	import os
	import subprocess
	import sys
	import argparse
	import time

	# Put the directory above this file on the module search path.
	sys.path.append(
	    os.path.abspath(
	        os.path.join(os.path.dirname(__file__), "..")
	    )
	)

	# Binary size units, expressed in bytes.
	KB = 1 << 10
	MB = KB * KB
	GB = KB * MB

	# ANSI escape sequences used to colour terminal messages.
	RESET = "\033[0m"
	RED = "\033[91m"
	GREEN = "\033[92m"
	YELLOW = "\033[93m"

	def check_and_install_zipnn():
	    """Make sure the ``zipnn`` package is importable, installing it if needed.

	    If ``import zipnn`` fails, the package is installed (or upgraded) with
	    ``pip`` using the interpreter that is running this script, and the import
	    is attempted again.

	    Raises:
	        subprocess.CalledProcessError: If the ``pip install`` command exits
	            with a non-zero status.
	        ImportError: If ``zipnn`` still cannot be imported after installation.
	    """
	    try:
	        import zipnn  # noqa: F401
	    except ImportError:
	        import importlib

	        print("zipnn not found. Installing...")
	        pip_command = [
	            sys.executable,
	            "-m",
	            "pip",
	            "install",
	            "zipnn",
	            "--upgrade",
	        ]
	        subprocess.check_call(pip_command)
	        # The package was installed while this interpreter was already
	        # running; drop the import system's cached directory listings so the
	        # freshly installed files are found by the retry below.
	        importlib.invalidate_caches()
	        import zipnn  # noqa: F401


	def parse_streaming_chunk_size(
	    streaming_chunk_size,
	):
	    """Convert a chunk-size specification into a number of bytes.

	    Accepted forms (case-insensitive, surrounding whitespace ignored):

	    * an integer or digit-only string, taken as a byte count
	      (``1048576``, ``"4096"``);
	    * digits followed by ``B`` for bytes (``"512B"``);
	    * an integer followed by a binary unit, with or without the trailing
	      ``B``: ``"64KB"``, ``"64k"``, ``"1MB"``, ``"2G"``.

	    Args:
	        streaming_chunk_size: The size specification, as an ``int`` or ``str``.

	    Returns:
	        The size in bytes as an ``int``.

	    Raises:
	        ValueError: If the unit is not one of k/m/g, the numeric part is not
	            an integer, or the size is negative.
	    """
	    # Powers of 1024, so that "1KB" == 1024 bytes.
	    multipliers = {"k": 1024, "m": 1024**2, "g": 1024**3}

	    text = str(streaming_chunk_size).strip().lower()
	    if text.isdigit():
	        return int(text)

	    # Drop the optional trailing "b" so "4KB" and "4K" parse the same way.
	    body = text[:-1] if text.endswith("b") else text
	    if body.isdigit():
	        # Plain byte count written with an explicit "B" suffix.
	        return int(body)

	    number, unit = body[:-1].strip(), body[-1:]
	    if unit not in multipliers:
	        raise ValueError(f"Invalid size unit: {unit}. Use 'k', 'm', or 'g'.")
	    try:
	        size_value = int(number)
	    except ValueError:
	        raise ValueError(
	            f"Invalid streaming chunk size: {streaming_chunk_size!r}. "
	            "Expected an integer optionally followed by KB, MB or GB."
	        ) from None
	    if size_value < 0:
	        raise ValueError(
	            f"Invalid streaming chunk size: {streaming_chunk_size!r}. "
	            "The size must not be negative."
	        )
	    return multipliers[unit] * size_value


	def compress_file(
	    input_file,
	    dtype="",
	    streaming_chunk_size=1048576,
	    delete=False,
	    force=False,
	    hf_cache=False,
	):
	    """Compress ``input_file`` with ZipNN and write the result to ``<input_file>.znn``.

	    The whole input file is read into memory, compressed in one call, and a
	    summary of the sizes and elapsed time is printed.

	    Args:
	        input_file: Path of the file to compress.
	        dtype: Any truthy value selects ``bytearray_dtype="float32"`` for the
	            ZipNN compressor; a falsy value leaves the compressor's default.
	        streaming_chunk_size: Chunk size as accepted by
	            ``parse_streaming_chunk_size``; the parsed value is forwarded to
	            ZipNN's ``streaming_chunk_kb`` argument.
	        delete: If true (and ``hf_cache`` is false), remove ``input_file``
	            after compression.
	        force: If true, overwrite an existing ``.znn`` file without asking.
	        hf_cache: If true, treat ``input_file`` as a symlink inside a Hugging
	            Face cache snapshot: the compressed output is moved onto the path
	            the symlink points to, a ``.znn`` symlink to that path is created,
	            and the original symlink is removed.

	    Returns:
	        None. Returns early if the input does not exist or the user declines
	        to overwrite an existing output.

	    Raises:
	        ValueError: If ``streaming_chunk_size`` cannot be parsed.
	        Exception: If reorganizing the Hugging Face cache fails; the
	            underlying error is chained as ``__cause__``.
	    """
	    import zipnn

	    streaming_chunk_size = parse_streaming_chunk_size(streaming_chunk_size)
	    if not os.path.exists(input_file):
	        print(f"{RED}File not found{RESET}")
	        return

	    output_file = input_file + ".znn"
	    if not force and os.path.exists(output_file):
	        user_input = (
	            input(f"{output_file} already exists; overwrite (y/n)? ").strip().lower()
	        )
	        if user_input not in ("yes", "y"):
	            print(f"Skipping {input_file}...")
	            return
	    print(f"Compressing {input_file}...")

	    zipnn_kwargs = {
	        "is_streaming": True,
	        "streaming_chunk_kb": streaming_chunk_size,
	    }
	    if dtype:
	        zipnn_kwargs["bytearray_dtype"] = "float32"
	    zpn = zipnn.ZipNN(**zipnn_kwargs)

	    file_size_before = 0
	    file_size_after = 0
	    start_time = time.time()
	    with open(input_file, "rb") as infile, open(output_file, "wb") as outfile:
	        chunk = infile.read()
	        file_size_before = len(chunk)
	        compressed_chunk = zpn.compress(chunk)
	        if compressed_chunk:
	            file_size_after = len(compressed_chunk)
	            outfile.write(compressed_chunk)
	    end_time = time.time() - start_time

	    # An empty input would otherwise divide by zero in the ratio below.
	    if file_size_before:
	        remaining_pct = file_size_after / file_size_before * 100
	    else:
	        remaining_pct = 0.0
	    print(f"Compressed {input_file} to {output_file}")
	    print(
	        f"{GREEN}Original size: {file_size_before/GB:.02f}GB "
	        f"size after compression: {file_size_after/GB:.02f}GB, "
	        f"Remaining size is {remaining_pct:.02f}% of original, "
	        f"time: {end_time:.02f}{RESET}"
	    )

	    if delete and not hf_cache:
	        print(f"Deleting {input_file}...")
	        os.remove(input_file)

	    if hf_cache:
	        # If the file is in the Hugging Face cache, fix the symlinks
	        print(f"{YELLOW}Reorganizing Hugging Face cache...{RESET}")
	        try:
	            snapshot_path = os.path.dirname(input_file)
	            # The link target is joined onto the snapshot directory to get
	            # the path of the file the symlink points to.
	            blob_name = os.path.join(snapshot_path, os.readlink(input_file))
	            os.rename(output_file, blob_name)
	            os.symlink(blob_name, output_file)
	            if os.path.exists(input_file):
	                os.remove(input_file)
	        except Exception as e:
	            # Chain the original error so its traceback is not lost.
	            raise Exception(f"Error reorganizing Hugging Face cache: {e}") from e

	def _build_arg_parser():
	    """Create the command-line parser for compressing a single file."""
	    parser = argparse.ArgumentParser(description="Enter a file path to compress.")
	    parser.add_argument(
	        "input_file",
	        type=str,
	        help="Specify the path to the file to compress.",
	    )
	    parser.add_argument(
	        "--float32",
	        action="store_true",
	        help="A flag that triggers float32 compression",
	    )
	    parser.add_argument(
	        "--streaming_chunk_size",
	        type=str,
	        help="An optional streaming chunk size. The format is int (for size in Bytes) or int+KB/MB/GB. Default is 1MB",
	    )
	    parser.add_argument(
	        "--delete",
	        action="store_true",
	        help="A flag that triggers deletion of the original file after compression.",
	    )
	    parser.add_argument(
	        "--force",
	        action="store_true",
	        help="A flag that forces overwriting when compressing.",
	    )
	    parser.add_argument(
	        "--hf_cache",
	        action="store_true",
	        help="A flag that indicates if the file is in the Hugging Face cache.",
	    )
	    return parser


	def main(argv=None):
	    """Parse command-line arguments and compress the requested file.

	    Args:
	        argv: Argument list without the program name. Defaults to
	            ``sys.argv[1:]``.

	    Exits with status 1 after printing the help text when no arguments are
	    given. Only the options actually supplied are forwarded to
	    ``compress_file``, so its own defaults apply to the rest.
	    """
	    parser = _build_arg_parser()
	    argv = sys.argv[1:] if argv is None else list(argv)
	    if not argv:
	        # Show the real usage generated from the parser, keeping the
	        # exit status of 1 for a call without arguments.
	        parser.print_help()
	        sys.exit(1)

	    args = parser.parse_args(argv)
	    optional_kwargs = {}
	    if args.float32:
	        optional_kwargs["dtype"] = 32
	    if args.streaming_chunk_size is not None:
	        optional_kwargs["streaming_chunk_size"] = args.streaming_chunk_size
	    for flag in ("delete", "force", "hf_cache"):
	        if getattr(args, flag):
	            optional_kwargs[flag] = True

	    check_and_install_zipnn()
	    compress_file(args.input_file, **optional_kwargs)


	if __name__ == "__main__":
	    main()