Instructions to use xlr8harder/talkie-1930-13b-base-tf with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use xlr8harder/talkie-1930-13b-base-tf with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="xlr8harder/talkie-1930-13b-base-tf", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("xlr8harder/talkie-1930-13b-base-tf", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use xlr8harder/talkie-1930-13b-base-tf with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "xlr8harder/talkie-1930-13b-base-tf"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "xlr8harder/talkie-1930-13b-base-tf",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/xlr8harder/talkie-1930-13b-base-tf
- SGLang
How to use xlr8harder/talkie-1930-13b-base-tf with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "xlr8harder/talkie-1930-13b-base-tf" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "xlr8harder/talkie-1930-13b-base-tf",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "xlr8harder/talkie-1930-13b-base-tf" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "xlr8harder/talkie-1930-13b-base-tf",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use xlr8harder/talkie-1930-13b-base-tf with Docker Model Runner:
docker model run hf.co/xlr8harder/talkie-1930-13b-base-tf
| from __future__ import annotations | |
| from collections.abc import Mapping | |
| from transformers import PretrainedConfig | |
class TalkieConfig(PretrainedConfig):
    """Configuration object for models of type ``"talkie"``.

    Stores the architecture hyperparameters passed to ``__init__`` as
    attributes, normalizes the RoPE scaling dictionary, and exposes the
    ``hidden_size`` / ``num_hidden_layers`` / ``num_attention_heads`` aliases
    alongside the native ``n_embd`` / ``n_layer`` / ``n_head`` names.
    """

    model_type = "talkie"

    def __init__(
        self,
        vocab_size: int = 65536,
        n_layer: int = 40,
        n_head: int = 40,
        n_embd: int = 5120,
        head_dim: int = 128,
        max_position_embeddings: int = 2048,
        rope_base: int = 1_000_000,
        rope_scaling: dict | None = None,
        rope_parameters: dict | None = None,
        logit_scale: float = 1.0,
        use_cache: bool = True,
        tie_word_embeddings: bool = False,
        bos_token_id: int | None = None,
        eos_token_id: int | list[int] = 65535,
        pad_token_id: int | None = None,
        **kwargs,
    ):
        """Build a configuration from explicit hyperparameters.

        Args:
            vocab_size: Stored as ``self.vocab_size``.
            n_layer: Stored as ``self.n_layer`` and ``self.num_hidden_layers``.
            n_head: Stored as ``self.n_head`` and ``self.num_attention_heads``.
            n_embd: Stored as ``self.n_embd`` and ``self.hidden_size``.
            head_dim: Stored as ``self.head_dim``.
            max_position_embeddings: Stored as ``self.max_position_embeddings``.
            rope_base: Stored as ``self.rope_base``.
            rope_scaling: Optional RoPE scaling mapping; validated and
                normalized by ``_normalize_rope_scaling``.
            rope_parameters: Fallback used only when ``rope_scaling`` is
                ``None``; treated exactly like ``rope_scaling``.
            logit_scale: Stored as ``self.logit_scale``.
            use_cache: Stored as ``self.use_cache``.
            tie_word_embeddings: Forwarded to ``PretrainedConfig.__init__``.
            bos_token_id: Forwarded to ``PretrainedConfig.__init__``.
            eos_token_id: Forwarded to ``PretrainedConfig.__init__``.
            pad_token_id: Forwarded to ``PretrainedConfig.__init__``.
            **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.

        Raises:
            TypeError: If the effective RoPE scaling value is not a mapping.
            ValueError: If the RoPE scaling mapping lacks a type, names an
                unsupported type, or has a factor below 1.0.
        """
        # ``rope_parameters`` is accepted as an alternative spelling; an
        # explicit ``rope_scaling`` always wins.
        if rope_scaling is None:
            rope_scaling = rope_parameters

        # These attributes are set before the base initializer runs.
        # NOTE(review): presumably the base class inspects them during
        # __init__ in some Transformers versions -- confirm before removing.
        self.max_position_embeddings = max_position_embeddings
        self.rope_scaling = self._normalize_rope_scaling(rope_scaling)
        self.rope_parameters = self.rope_scaling

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.head_dim = head_dim
        self.max_position_embeddings = max_position_embeddings
        self.rope_base = rope_base
        # Normalized again from the caller's input so the final attributes are
        # a fresh dict, independent of anything the base initializer did.
        self.rope_scaling = self._normalize_rope_scaling(rope_scaling)
        self.rope_parameters = self.rope_scaling
        self.logit_scale = logit_scale
        self.use_cache = use_cache

        # Common Transformers aliases used by generation/cache helpers.
        self.hidden_size = n_embd
        self.num_hidden_layers = n_layer
        self.num_attention_heads = n_head

    # Declared static: the function takes no ``self`` yet is invoked as
    # ``self._normalize_rope_scaling(...)``; without the decorator that call
    # would pass the instance as an unexpected extra positional argument.
    @staticmethod
    def _normalize_rope_scaling(rope_scaling: dict | None) -> dict | None:
        """Validate a RoPE scaling mapping and return a normalized copy.

        Args:
            rope_scaling: ``None`` or a mapping carrying ``"rope_type"`` (or
                the alternative key ``"type"``) plus optional parameters.

        Returns:
            ``None`` when the input is ``None`` or the type is ``"default"``;
            otherwise a new ``dict`` with a lower-cased ``"rope_type"``
            (``"ntk"`` mapped to ``"dynamic"``), no ``"type"`` key, a float
            ``"factor"`` (1.0 when absent), and any of
            ``"original_max_position_embeddings"``, ``"beta_fast"``,
            ``"beta_slow"``, ``"attention_factor"`` coerced to int/float.
            The input mapping is not modified.

        Raises:
            TypeError: If ``rope_scaling`` is neither ``None`` nor a mapping.
            ValueError: If no type key is present, the type is unsupported,
                or the factor is below 1.0.
        """
        if rope_scaling is None:
            return None
        if not isinstance(rope_scaling, Mapping):
            raise TypeError("rope_scaling must be a dictionary")

        # Work on a copy so the caller's mapping is left untouched.
        scaling = dict(rope_scaling)

        rope_type = scaling.get("rope_type", scaling.get("type"))
        if rope_type is None:
            raise ValueError("rope_scaling must include 'rope_type' or 'type'")
        rope_type = str(rope_type).lower()
        if rope_type == "ntk":
            rope_type = "dynamic"

        supported = {"default", "linear", "dynamic", "yarn"}
        if rope_type not in supported:
            raise ValueError(
                f"unsupported rope_scaling type {rope_type!r}; expected one of {sorted(supported)}"
            )
        if rope_type == "default":
            return None

        factor = float(scaling.get("factor", 1.0))
        if factor < 1.0:
            raise ValueError("rope_scaling factor must be >= 1.0")

        scaling["rope_type"] = rope_type
        scaling.pop("type", None)
        scaling["factor"] = factor

        if "original_max_position_embeddings" in scaling:
            scaling["original_max_position_embeddings"] = int(
                scaling["original_max_position_embeddings"]
            )
        if "beta_fast" in scaling:
            scaling["beta_fast"] = float(scaling["beta_fast"])
        if "beta_slow" in scaling:
            scaling["beta_slow"] = float(scaling["beta_slow"])
        # An explicit None for attention_factor is preserved as-is.
        if "attention_factor" in scaling and scaling["attention_factor"] is not None:
            scaling["attention_factor"] = float(scaling["attention_factor"])
        return scaling