czyzi0/the-mc-speech-dataset
Viewer • Updated • 24k • 383 • 8
How to use salihfurkaan/VoxPolska-Auralis with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-to-speech", model="salihfurkaan/VoxPolska-Auralis")
# Load model directly
from transformers import AutoTokenizer, AutoModelForMultimodalLM
tokenizer = AutoTokenizer.from_pretrained("salihfurkaan/VoxPolska-Auralis")
model = AutoModelForMultimodalLM.from_pretrained("salihfurkaan/VoxPolska-Auralis")
How to use salihfurkaan/VoxPolska-Auralis with Unsloth Studio:
curl -fsSL https://unsloth.ai/install.sh | sh
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for salihfurkaan/VoxPolska-Auralis to start chatting
irm https://unsloth.ai/install.ps1 | iex
# Run unsloth studio
unsloth studio -H 0.0.0.0 -p 8888
# Then open http://localhost:8888 in your browser
# Search for salihfurkaan/VoxPolska-Auralis to start chatting
# No setup required
# Open https://huggingface.co/spaces/unsloth/studio in your browser
# Search for salihfurkaan/VoxPolska-Auralis to start chatting
pip install unsloth
from unsloth import FastModel
model, tokenizer = FastModel.from_pretrained(
model_name="salihfurkaan/VoxPolska-Auralis",
max_seq_length=2048,
)
Configuration Parsing Warning: Config file tokenizer_config.json cannot be fetched (too big)
!pip install transformers ipython
from transformers import pipeline
from IPython.display import Audio
pipe = pipeline("text-to-speech", model="salihfurkaan/VoxPolska-Auralis")
output = pipe("Cześć, jestem modelem sztucznej inteligencji mówiącym po polsku")
Audio(output["audio"], rate=output["sampling_rate"])
!pip install --no-deps unsloth==2025.4.1 bitsandbytes unsloth_zoo trl==0.15.2
!pip install xcodec2==0.1.5 --no-deps
!pip install vector_quantize_pytorch
from unsloth import FastLanguageModel
import torch
from xcodec2.modeling_xcodec2 import XCodec2Model
import torchaudio
import soundfile as sf
from IPython.display import display, Audio
from transformers import AutoTokenizer, AutoModelForCausalLM
# Text that will be converted to speech.
input_text = "Cześć, jestem modelem sztucznej inteligencji mówiącym po polsku."

# Hugging Face repo id of the XCodec2 codec that turns speech ids into audio.
XCODEC2_MODEL_NAME = "HKUST-Audio/xcodec2"
# Sample rate in Hz; the script writes and plays its audio at 16000.
SAMPLE_RATE = 16000

# Device string handed to torch.amp.autocast in the generation step.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The codec is used on the CPU (the speech-id tensor is moved with .cpu()
# before decoding), so it is placed there directly instead of being moved to
# `device` and then straight back to the CPU.
codec_model = XCodec2Model.from_pretrained(XCODEC2_MODEL_NAME)
codec_model = codec_model.to("cpu").eval()

tokenizer = AutoTokenizer.from_pretrained("salihfurkaan/VoxPolska-Auralis")
model = AutoModelForCausalLM.from_pretrained("salihfurkaan/VoxPolska-Auralis")
# NOTE(review): presumably switches the model to Unsloth's inference mode;
# confirm against the Unsloth documentation.
FastLanguageModel.for_inference(model)
def ids_to_speech_tokens(speech_ids):
    """Format speech ids as ``<|s_{id}|>`` token strings.

    Args:
        speech_ids: Iterable of ids; each one is interpolated into the
            token template as-is.

    Returns:
        A list with one ``<|s_{id}|>`` string per input id, in input order.
    """
    return [f"<|s_{speech_id}|>" for speech_id in speech_ids]
def extract_speech_ids(speech_tokens_str):
    """Parse ``<|s_{id}|>`` token strings back into integer speech ids.

    Tokens that do not have the ``<|s_...|>`` shape, or whose payload is not
    an integer (for example ``<|s_abc|>`` or ``<|s_|>``), are reported with an
    "Unexpected token" message and skipped, so one malformed token does not
    abort the whole conversion.

    Args:
        speech_tokens_str: Iterable of token strings.

    Returns:
        A list of the integer ids of the well-formed tokens, in input order.
    """
    prefix, suffix = '<|s_', '|>'
    speech_ids = []
    for token_str in speech_tokens_str:
        if not (token_str.startswith(prefix) and token_str.endswith(suffix)):
            print(f"Unexpected token: {token_str}")
            continue
        try:
            speech_ids.append(int(token_str[len(prefix):-len(suffix)]))
        except ValueError:
            # Right wrapper, but the payload is not a number.
            print(f"Unexpected token: {token_str}")
    return speech_ids
# Generate speech tokens with the language model, then decode them to audio.
with torch.inference_mode(), torch.amp.autocast(device, dtype=model.dtype):
    # Wrap the input text in the text-understanding markers and leave the
    # assistant turn open right after the speech-generation start marker, so
    # generation continues that message.
    formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
    chat = [
        {"role": "user", "content": "Convert the text to speech:" + formatted_text},
        {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"},
    ]
    input_ids = tokenizer.apply_chat_template(
        chat,
        tokenize=True,
        return_tensors='pt',
        continue_final_message=True,
    )
    speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

    # Generate the speech autoregressively, stopping at the end-of-speech token.
    outputs = model.generate(
        input_ids,
        max_length=2048,
        eos_token_id=speech_end_id,
        do_sample=True,
        # top_p is a probability mass and must not exceed 1; the previous
        # value of 1.2 was out of range. 1.0 means no nucleus filtering.
        top_p=1.0,
        temperature=1.2,  # Controls randomness in output
    )

    # Keep only the newly generated tokens: skip the prompt and drop the last
    # token. NOTE(review): the last token is the end marker only when
    # generation stopped on it; if max_length was hit, a speech token is lost.
    generated_ids = outputs[0][input_ids.shape[1]:-1]
    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    speech_tokens = extract_speech_ids(speech_tokens)

    # Add two leading dimensions before handing the ids to the codec.
    speech_tokens = torch.tensor(speech_tokens).cpu().unsqueeze(0).unsqueeze(0)
    gen_wav = codec_model.decode_code(speech_tokens)

# Write and play the first waveform using the shared sample-rate constant
# instead of repeating the literal 16000.
waveform = gen_wav[0, 0, :].cpu().numpy()
sf.write("output.wav", waveform, SAMPLE_RATE)
display(Audio(waveform, rate=SAMPLE_RATE))
You can get your Hugging Face access token from your account settings: https://huggingface.co/settings/tokens
For questions, suggestions, and feedback, please open an issue on HuggingFace. You can also reach the author via: LinkedIn
Do not use this model for impersonation without consent, misinformation or deception (including fake news or fraudulent calls), or any illegal or harmful activity. By using this model, you agree to follow all applicable laws and ethical guidelines.
@misc{erik2025voxpolska,
title={salihfurkaan/VoxPolska-Auralis},
author={Salih Furkan Erik},
year={2025},
url={https://huggingface.co/salihfurkaan/VoxPolska-Auralis/}
}
Base model
meta-llama/Llama-3.2-1B-Instruct