How to use from
vLLM
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "yujiepan/qwen2-audio-tiny-random"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "yujiepan/qwen2-audio-tiny-random",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'
Use Docker
docker model run hf.co/yujiepan/qwen2-audio-tiny-random
Quick Links

This model is intended for debugging only. It is randomly initialized from the config of Qwen/Qwen2-Audio-7B-Instruct, scaled down to a much smaller size, so its outputs are not meaningful.

Code used to create this model:

import os
from typing import Dict

import requests
import torch
import transformers
from PIL import Image
from torchvision import io
from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor,
                          AutoTokenizer, GenerationConfig,
                          Qwen2AudioForConditionalGeneration, pipeline,
                          set_seed)

# Reference checkpoint that supplies the config, generation config and
# processor, plus the destination repo name and local output directory.
model_id = "Qwen/Qwen2-Audio-7B-Instruct"
repo_id = "yujiepan/qwen2-audio-tiny-random"
save_path = f"/tmp/{repo_id}"

# Start from the reference config and shrink both sub-configs to toy sizes.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
audio_overrides = {
    "encoder_layers": 2,
    "encoder_attention_heads": 2,
    "encoder_ffn_dim": 32,
    "d_model": 16,
}
text_overrides = {
    "num_hidden_layers": 2,
    "intermediate_size": 32,
    "hidden_size": 16,
    "num_attention_heads": 2,
    "num_key_value_heads": 1,
}
for field_name, field_value in audio_overrides.items():
    setattr(config.audio_config, field_name, field_value)
for field_name, field_value in text_overrides.items():
    setattr(config.text_config, field_name, field_value)

# Build the model from the shrunken config, cast to bf16, move it to the GPU
# and switch to eval mode.
model = Qwen2AudioForConditionalGeneration(config=config)
model = model.to(torch.bfloat16)
model = model.cuda()
model = model.eval()
model.generation_config = GenerationConfig.from_pretrained(
    model_id, trust_remote_code=True,
)

# Seed first, then overwrite every parameter in name-sorted order so the
# random weights are reproducible.
set_seed(42)
with torch.no_grad():
    for _name, weight in sorted(model.named_parameters()):
        torch.nn.init.uniform_(weight, -0.3, 0.3)

# Save the model together with the reference processor, then list the output.
processor = AutoProcessor.from_pretrained(model_id)
for artifact in (model, processor):
    artifact.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")


def try_inference():
    """Smoke-test the checkpoint saved at ``save_path`` with an audio chat.

    Reloads the processor and model from ``save_path``, renders a three-turn
    conversation that references two remote audio clips, downloads and decodes
    those clips, runs ``generate`` and prints the decoded continuation.

    Requires network access (for the audio URLs) and the ``librosa`` package.
    """
    from io import BytesIO
    from urllib.request import urlopen

    import librosa
    processor = AutoProcessor.from_pretrained(save_path)
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        save_path, device_map="auto")
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
        ]},
        {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
        ]},
    ]
    text = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False)

    # Resample every clip to the rate the feature extractor expects.
    sampling_rate = processor.feature_extractor.sampling_rate
    audios = []
    for message in conversation:
        content = message["content"]
        # Plain-string turns (e.g. the assistant reply) carry no audio.
        if not isinstance(content, list):
            continue
        for ele in content:
            if ele["type"] != "audio":
                continue
            # Close the HTTP response deterministically once it is read.
            with urlopen(ele["audio_url"]) as response:
                raw_audio = response.read()
            audios.append(
                librosa.load(BytesIO(raw_audio), sr=sampling_rate)[0])

    inputs = processor(text=text, audios=audios,
                       return_tensors="pt", padding=True)
    # Move *all* tensors to the model's device. Assigning to
    # ``inputs.input_ids`` only sets an instance attribute and leaves the
    # mapping that ``**inputs`` unpacks untouched (still on CPU).
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].size(1)

    # NOTE(review): ``max_length`` counts prompt tokens as well; if the prompt
    # is already longer than 256 tokens, ``max_new_tokens`` may be the better
    # knob — confirm against the installed transformers version.
    generate_ids = model.generate(**inputs, max_length=256)
    # Drop the prompt so only newly generated tokens are decoded.
    generate_ids = generate_ids[:, prompt_length:]

    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print(response)


# Run the inference check, which reloads the checkpoint from save_path.
try_inference()
Downloads last month
43
Safetensors
Model size
5.03M params
Tensor type
BF16
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Collection including yujiepan/qwen2-audio-tiny-random