Fredtt3 committed on
Commit
f2043c7
·
verified ·
1 Parent(s): 0023ce4

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +141 -1
README.md CHANGED
@@ -22,4 +22,144 @@ tags:
22
  ---
23
  <h1 align="center">Athenea-4B-Thinking</h1>
24
 
25
- ![image](atheneamodel.png)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  ---
23
  <h1 align="center">Athenea-4B-Thinking</h1>
24
 
25
+ ![image](atheneamodel.png)
26
+
27
+ ## 💻 Usage
28
+
29
+ ### Installation
30
+
31
+ ```bash
32
+ uv pip install transformers torch accelerate
33
+ ```
34
+
35
+ ### Basic Inference
36
+
37
+ ```python
38
+ from transformers import AutoModelForCausalLM, AutoTokenizer
39
+ import torch
40
+
41
+ model = AutoModelForCausalLM.from_pretrained("Aquiles-ai/Athenea-4B-Thinking",
42
+ dtype=torch.bfloat16,
43
+ trust_remote_code=True,
44
+ device_map="auto",
45
+ attn_implementation="flash_attention_2") # Requires flash-attn
46
+
47
+ # Without flash-attn:
48
+ # model = AutoModelForCausalLM.from_pretrained("Aquiles-ai/Athenea-4B-Thinking",
49
+ # dtype="auto",
50
+ # device_map="auto"
51
+ # )
52
+
53
+ tokenizer = AutoTokenizer.from_pretrained("Aquiles-ai/Athenea-4B-Thinking", trust_remote_code=True)
54
+ messages = [
55
+ {"role": "user", "content": "Hey, explain to me in simple terms how reinforcement learning works."}
56
+ ]
57
+
58
+ inputs = tokenizer.apply_chat_template(
59
+ messages,
60
+ add_generation_prompt=True,
61
+ tokenize=True,
62
+ return_dict=True,
63
+ return_tensors="pt",
64
+ ).to('cuda')
65
+
66
+ with torch.no_grad():
67
+ output = model.generate(
68
+ **inputs,
69
+ max_new_tokens=8092,
70
+ pad_token_id=tokenizer.eos_token_id,
71
+ eos_token_id=tokenizer.eos_token_id,
72
+ )
73
+
74
+ # Decode and print the output
75
+ print(tokenizer.decode(output[0], skip_special_tokens=True))
76
+ ```
77
+
78
+ ### Streaming Inference
79
+
80
+ ```python
81
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
82
+ import torch
83
+ from threading import Thread
84
+
85
+ model = AutoModelForCausalLM.from_pretrained("Aquiles-ai/Athenea-4B-Thinking",
86
+ dtype=torch.bfloat16,
87
+ trust_remote_code=True,
88
+ device_map="auto",
89
+ attn_implementation="flash_attention_2")
90
+
91
+ tokenizer = AutoTokenizer.from_pretrained("Aquiles-ai/Athenea-4B-Thinking", trust_remote_code=True)
92
+
93
+ messages = [
94
+ {"role": "user", "content": "Hey, explain the difference between artificial intelligence, machine learning, and deep learning."}
95
+ ]
96
+
97
+ inputs = tokenizer.apply_chat_template(
98
+ messages,
99
+ add_generation_prompt=True,
100
+ tokenize=True,
101
+ return_dict=True,
102
+ return_tensors="pt",
103
+ ).to('cuda')
104
+
105
+ # Create the streamer
106
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
107
+
108
+ # Build kwargs for generate
109
+ generate_kwargs = dict(
110
+ **inputs,
111
+ max_new_tokens=8092,
112
+ pad_token_id=tokenizer.eos_token_id,
113
+ eos_token_id=tokenizer.eos_token_id,
114
+ streamer=streamer,
115
+ )
116
+
117
+ def _generate_thread(model, kwargs):
118
+ with torch.no_grad():
119
+ model.generate(**kwargs)
120
+
121
+ thread = Thread(target=_generate_thread, args=(model, generate_kwargs))
122
+
123
+ thread.start()
124
+
125
+ for chunk in streamer:
126
+ print(chunk, end="", flush=True)
127
+ ```
128
+
129
+ ### Production Deployment with vLLM
130
+
131
+ **Start server:**
132
+
133
+ ```bash
134
+ vllm serve Aquiles-ai/Athenea-4B-Thinking \
135
+ --host 0.0.0.0 \
136
+ --port 8000 \
137
+ --api-key dummyapikey \
138
+ --max-model-len=16384 \
139
+ --async-scheduling \
140
+ --gpu-memory-utilization=0.90
141
+ ```
142
+
143
+ **Request to the server from the OpenAI client:**
144
+
145
+ ```python
146
+ from openai import OpenAI
147
+
148
+ client = OpenAI(api_key="dummyapikey", base_url="http://127.0.0.1:8000/v1")
149
+
150
+ stream = client.chat.completions.create(
151
+ model="Aquiles-ai/Athenea-4B-Thinking",
152
+ messages=[{
153
+ "role": "user",
154
+ "content": "Hey, tell me how a large language model like Llama or GPT is trained."
155
+ }],
156
+ max_tokens=8092,
157
+ stream=True
158
+ )
159
+
160
+ for chunk in stream:
161
+ if chunk.choices[0].delta.content:
162
+ print(chunk.choices[0].delta.content, end="", flush=True)
163
+ ```
164
+
165
+ **vLLM Benefits:** 20-30x faster inference, OpenAI-compatible API, continuous batching, async scheduling.