Instructions to use xlr8harder/talkie-1930-13b-base-tf with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use xlr8harder/talkie-1930-13b-base-tf with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="xlr8harder/talkie-1930-13b-base-tf", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("xlr8harder/talkie-1930-13b-base-tf", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use xlr8harder/talkie-1930-13b-base-tf with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "xlr8harder/talkie-1930-13b-base-tf"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "xlr8harder/talkie-1930-13b-base-tf",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker

docker model run hf.co/xlr8harder/talkie-1930-13b-base-tf

SGLang

How to use xlr8harder/talkie-1930-13b-base-tf with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "xlr8harder/talkie-1930-13b-base-tf" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "xlr8harder/talkie-1930-13b-base-tf",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "xlr8harder/talkie-1930-13b-base-tf" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "xlr8harder/talkie-1930-13b-base-tf",
		"prompt": "Once upon a time,",
		"max_tokens": 512,
		"temperature": 0.5
	}'

Docker Model Runner
How to use xlr8harder/talkie-1930-13b-base-tf with Docker Model Runner:
```
docker model run hf.co/xlr8harder/talkie-1930-13b-base-tf
```

talkie-1930-13b-base-tf / configuration_talkie.py

xlr8harder

Fix vLLM CUDA graph capture in forward path

294862a verified about 2 months ago

Raw

History Blame Contribute Delete

3.64 kB

	from __future__ import annotations

	from collections.abc import Mapping

	from transformers import PretrainedConfig


	class TalkieConfig(PretrainedConfig):
	    """Configuration class for models registered under ``model_type = "talkie"``.

	    The constructor stores the architecture hyper-parameters as attributes,
	    normalizes the optional RoPE scaling dictionary, and exposes the common
	    Transformers attribute names (``hidden_size``, ``num_hidden_layers``,
	    ``num_attention_heads``) as aliases of ``n_embd``, ``n_layer`` and
	    ``n_head``.

	    Args:
	        vocab_size: Stored as ``self.vocab_size``.
	        n_layer: Number of layers; also exposed as ``num_hidden_layers``.
	        n_head: Number of attention heads; also exposed as
	            ``num_attention_heads``.
	        n_embd: Embedding width; also exposed as ``hidden_size``.
	        head_dim: Stored as ``self.head_dim``.
	        max_position_embeddings: Stored as ``self.max_position_embeddings``.
	        rope_base: Stored as ``self.rope_base`` (presumably the RoPE base
	            frequency; confirm against the modeling code).
	        rope_scaling: Optional mapping describing RoPE scaling. It is
	            validated and normalized by :meth:`_normalize_rope_scaling`.
	        rope_parameters: Alternative spelling of ``rope_scaling``; only used
	            when ``rope_scaling`` is ``None``.
	        logit_scale: Stored as ``self.logit_scale``.
	        use_cache: Stored as ``self.use_cache``.
	        tie_word_embeddings: Forwarded to ``PretrainedConfig.__init__``.
	        bos_token_id: Forwarded to ``PretrainedConfig.__init__``.
	        eos_token_id: Forwarded to ``PretrainedConfig.__init__``.
	        pad_token_id: Forwarded to ``PretrainedConfig.__init__``.
	        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

	    Raises:
	        TypeError: If the RoPE scaling value is neither ``None`` nor a mapping.
	        ValueError: If the RoPE scaling mapping has no type, an unsupported
	            type, or a factor that is not ``>= 1.0``.
	    """

	    model_type = "talkie"

	    def __init__(
	        self,
	        vocab_size: int = 65536,
	        n_layer: int = 40,
	        n_head: int = 40,
	        n_embd: int = 5120,
	        head_dim: int = 128,
	        max_position_embeddings: int = 2048,
	        rope_base: int = 1_000_000,
	        rope_scaling: dict | None = None,
	        rope_parameters: dict | None = None,
	        logit_scale: float = 1.0,
	        use_cache: bool = True,
	        tie_word_embeddings: bool = False,
	        bos_token_id: int | None = None,
	        eos_token_id: int | list[int] = 65535,
	        pad_token_id: int | None = None,
	        **kwargs,
	    ):
	        # ``rope_parameters`` is accepted as a fallback spelling; an explicit
	        # ``rope_scaling`` always wins.
	        if rope_scaling is None:
	            rope_scaling = rope_parameters

	        # NOTE(review): these attributes are assigned before calling the base
	        # initializer, presumably because it may read them - confirm against
	        # the targeted Transformers version.
	        self.max_position_embeddings = max_position_embeddings
	        self.rope_scaling = self._normalize_rope_scaling(rope_scaling)
	        self.rope_parameters = self.rope_scaling

	        super().__init__(
	            bos_token_id=bos_token_id,
	            eos_token_id=eos_token_id,
	            pad_token_id=pad_token_id,
	            tie_word_embeddings=tie_word_embeddings,
	            **kwargs,
	        )

	        self.vocab_size = vocab_size
	        self.n_layer = n_layer
	        self.n_head = n_head
	        self.n_embd = n_embd
	        self.head_dim = head_dim
	        self.max_position_embeddings = max_position_embeddings
	        self.rope_base = rope_base
	        # Normalized a second time (rather than reusing the earlier result) so
	        # the final attributes hold a fresh dictionary regardless of anything
	        # the base initializer did with the first one.
	        self.rope_scaling = self._normalize_rope_scaling(rope_scaling)
	        self.rope_parameters = self.rope_scaling
	        self.logit_scale = logit_scale
	        self.use_cache = use_cache

	        # Common Transformers aliases used by generation/cache helpers.
	        self.hidden_size = n_embd
	        self.num_hidden_layers = n_layer
	        self.num_attention_heads = n_head

	    @staticmethod
	    def _normalize_rope_scaling(rope_scaling: dict | None) -> dict | None:
	        """Validate a RoPE scaling mapping and return a normalized copy.

	        The input mapping is never mutated. In the returned dictionary the
	        type is stored under ``"rope_type"`` (lower-cased, with ``"ntk"``
	        rewritten to ``"dynamic"``), the legacy ``"type"`` key is removed,
	        ``"factor"`` is a float (``1.0`` when absent), and the optional keys
	        ``"original_max_position_embeddings"``, ``"beta_fast"``,
	        ``"beta_slow"`` and ``"attention_factor"`` are coerced to ``int`` /
	        ``float`` when present. Any other keys are passed through unchanged.

	        Args:
	            rope_scaling: Mapping with a ``"rope_type"`` or ``"type"`` entry,
	                or ``None``.

	        Returns:
	            The normalized dictionary, or ``None`` when the input is ``None``
	            or its type is ``"default"``.

	        Raises:
	            TypeError: If ``rope_scaling`` is not ``None`` and not a mapping.
	            ValueError: If no type is given, the type is unsupported, or the
	                factor is not ``>= 1.0`` (this includes NaN).
	        """
	        if rope_scaling is None:
	            return None
	        if not isinstance(rope_scaling, Mapping):
	            raise TypeError("rope_scaling must be a dictionary")

	        # Work on a shallow copy so the caller's mapping is left untouched.
	        scaling = dict(rope_scaling)
	        rope_type = scaling.get("rope_type", scaling.get("type"))
	        if rope_type is None:
	            raise ValueError("rope_scaling must include 'rope_type' or 'type'")

	        rope_type = str(rope_type).lower()
	        if rope_type == "ntk":
	            # "ntk" is accepted as an alias of "dynamic".
	            rope_type = "dynamic"
	        supported = {"default", "linear", "dynamic", "yarn"}
	        if rope_type not in supported:
	            raise ValueError(
	                f"unsupported rope_scaling type {rope_type!r}; expected one of {sorted(supported)}"
	            )

	        # "default" means no scaling, which is represented as None.
	        if rope_type == "default":
	            return None

	        factor = float(scaling.get("factor", 1.0))
	        # Written as ``not factor >= 1.0`` so that NaN, for which every
	        # comparison is False, is rejected together with values below 1.0.
	        if not factor >= 1.0:
	            raise ValueError("rope_scaling factor must be >= 1.0")

	        scaling["rope_type"] = rope_type
	        scaling.pop("type", None)
	        scaling["factor"] = factor
	        if "original_max_position_embeddings" in scaling:
	            scaling["original_max_position_embeddings"] = int(
	                scaling["original_max_position_embeddings"]
	            )
	        if "beta_fast" in scaling:
	            scaling["beta_fast"] = float(scaling["beta_fast"])
	        if "beta_slow" in scaling:
	            scaling["beta_slow"] = float(scaling["beta_slow"])
	        if "attention_factor" in scaling and scaling["attention_factor"] is not None:
	            scaling["attention_factor"] = float(scaling["attention_factor"])
	        return scaling