# slm-bahasa-id / model.py
# romizone's picture
# Upload SLM Bahasa Indonesia
# 9815efc verified
# Raw
# History Blame
# 13 kB
"""
Small Language Model (SLM) - Transformer from Scratch
======================================================
Decoder-only (GPT-style) Transformer architecture for Indonesian (Bahasa Indonesia).
Built from scratch using PyTorch.
Author: Jekardah AI Lab
"""
import math
import json
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass, asdict
from typing import Optional
@dataclass
class SLMConfig:
    """Hyperparameters describing one SLM model.

    Every field is a plain number, so the whole config round-trips through
    JSON via :meth:`save` and :meth:`load`.
    """

    vocab_size: int = 32000        # number of rows in the token embedding table
    embed_dim: int = 256           # model width
    num_heads: int = 4             # attention heads; must divide embed_dim
    num_layers: int = 4            # number of stacked transformer blocks
    ffn_dim: int = 512             # hidden width of the feed-forward network
    max_seq_len: int = 128         # length of the precomputed RoPE tables
    dropout: float = 0.1           # dropout probability
    layer_norm_eps: float = 1e-5   # epsilon handed to RMSNorm

    def save(self, path: str):
        """Write all fields to ``path`` as JSON indented by two spaces."""
        with open(path, "w") as handle:
            handle.write(json.dumps(asdict(self), indent=2))

    @classmethod
    def load(cls, path: str) -> "SLMConfig":
        """Create a config from a JSON file whose keys are field names."""
        with open(path, "r") as handle:
            stored_fields = json.load(handle)
            return cls(**stored_fields)
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learned per-feature gain.

    The input is rescaled by the reciprocal RMS of its last dimension; there
    is no mean subtraction and no bias term.
    """

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        # Gain starts at one, so a fresh module only normalizes.
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` over its last dimension, then apply the gain."""
        mean_square = torch.mean(x.pow(2), dim=-1, keepdim=True)
        # eps keeps rsqrt finite when a feature vector is all zeros.
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return x * inv_rms * self.weight
class RotaryPositionalEncoding(nn.Module):
    """
    Precomputed cosine/sine tables for Rotary Position Embedding (RoPE).

    The tables have shape ``(max_seq_len, dim // 2)`` for even ``dim`` and are
    registered as buffers, so they follow the module across devices and are
    part of its ``state_dict``.
    """

    def __init__(self, dim: int, max_seq_len: int = 128, base: float = 10000.0):
        super().__init__()
        # One inverse frequency per pair of channels: base^(-2i / dim).
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer("inv_freq", 1.0 / (base ** exponents))

        # Angle for every (position, frequency) pair, computed once up front.
        positions = torch.arange(max_seq_len).float()
        angles = positions[:, None] * self.inv_freq[None, :]
        self.register_buffer("cos_cached", angles.cos())
        self.register_buffer("sin_cached", angles.sin())

    def forward(self, seq_len: int):
        """Return the ``(cos, sin)`` tables for the first ``seq_len`` positions.

        Slicing never reads past the cached tables: asking for more than
        ``max_seq_len`` positions returns only ``max_seq_len`` rows.
        """
        cos_table = self.cos_cached[:seq_len]
        sin_table = self.sin_cached[:seq_len]
        return cos_table, sin_table
def apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    """Rotate the channel pairs of ``x`` by position-dependent angles.

    Args:
        x: Tensor of shape (batch, heads, seq_len, head_dim).
        cos: Cosine table of shape (>= seq_len, head_dim // 2).
        sin: Sine table with the same shape as ``cos``.

    Returns:
        Tensor with the same shape as ``x``. Channel ``i`` of the first half
        is paired with channel ``i`` of the second half, and each pair is
        rotated by the angle stored for its position.
    """
    seq_len = x.shape[2]
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]

    # Crop the tables to this sequence and add broadcast axes for
    # batch and heads: (1, 1, seq_len, head_dim // 2).
    cos = cos[:seq_len][None, None]
    sin = sin[:seq_len][None, None]

    # 2-D rotation of every (first_i, second_i) pair.
    rotated_first = first * cos - second * sin
    rotated_second = second * cos + first * sin
    return torch.cat((rotated_first, rotated_second), dim=-1)
class MultiHeadSelfAttention(nn.Module):
    """
    Multi-head self-attention with rotary position embeddings and a mask.

    With the default causal mask each token attends only to itself and to
    earlier tokens, which is what autoregressive decoding requires.
    """

    def __init__(self, config: SLMConfig):
        super().__init__()
        assert config.embed_dim % config.num_heads == 0, \
            "embed_dim harus bisa dibagi num_heads"
        width = config.embed_dim
        self.num_heads = config.num_heads
        self.head_dim = width // config.num_heads
        self.embed_dim = width

        # Bias-free projections for queries, keys, values and the output.
        self.q_proj = nn.Linear(width, width, bias=False)
        self.k_proj = nn.Linear(width, width, bias=False)
        self.v_proj = nn.Linear(width, width, bias=False)
        self.out_proj = nn.Linear(width, width, bias=False)

        # Rotary tables are sized per head, not per model width.
        self.rope = RotaryPositionalEncoding(self.head_dim, config.max_seq_len)
        self.attn_dropout = nn.Dropout(config.dropout)

    def _split_heads(self, projected: torch.Tensor) -> torch.Tensor:
        """Reshape (batch, seq, embed) into (batch, heads, seq, head_dim)."""
        batch_size, seq_len, _ = projected.shape
        per_head = projected.view(batch_size, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Attend over ``x`` and return a tensor of the same shape.

        Args:
            x: Input of shape (batch, seq_len, embed_dim).
            mask: Optional boolean tensor, expected to be (seq_len, seq_len),
                where True marks positions that must not be attended to.
                When omitted, a causal (strictly upper-triangular) mask is
                built on ``x``'s device.
        """
        batch_size, seq_len, _ = x.shape

        queries = self._split_heads(self.q_proj(x))
        keys = self._split_heads(self.k_proj(x))
        values = self._split_heads(self.v_proj(x))

        # Positions enter through a rotation of queries and keys only.
        cos, sin = self.rope(seq_len)
        queries = apply_rotary_emb(queries, cos, sin)
        keys = apply_rotary_emb(keys, cos, sin)

        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is None:
            all_pairs = torch.ones(seq_len, seq_len, device=x.device, dtype=torch.bool)
            mask = torch.triu(all_pairs, diagonal=1)
        # -inf scores become exactly zero probability after softmax.
        scores = scores.masked_fill(mask[None, None], float('-inf'))

        weights = self.attn_dropout(F.softmax(scores, dim=-1))
        context = torch.matmul(weights, values)

        # Merge the heads back into a single embed_dim-wide vector per token.
        merged = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim)
        return self.out_proj(merged)
class FeedForward(nn.Module):
    """
    Position-wise feed-forward network with a SwiGLU gate.

    Computes ``down_proj(silu(gate_proj(x)) * up_proj(x))`` followed by
    dropout; all three projections are bias-free.
    """

    def __init__(self, config: SLMConfig):
        super().__init__()
        model_width, hidden_width = config.embed_dim, config.ffn_dim
        self.gate_proj = nn.Linear(model_width, hidden_width, bias=False)
        self.up_proj = nn.Linear(model_width, hidden_width, bias=False)
        self.down_proj = nn.Linear(hidden_width, model_width, bias=False)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated FFN; the output has the same shape as ``x``."""
        # SwiGLU: the SiLU-activated gate scales the linear "up" branch.
        gated = F.silu(self.gate_proj(x)) * self.up_proj(x)
        return self.dropout(self.down_proj(gated))
class TransformerBlock(nn.Module):
    """
    One pre-norm Transformer block: attention, then feed-forward.

    Each sub-layer reads an RMS-normalized copy of the stream and its output
    is added back to the un-normalized stream (residual connection).
    """

    def __init__(self, config: SLMConfig):
        super().__init__()
        self.attention = MultiHeadSelfAttention(config)
        self.feed_forward = FeedForward(config)
        eps = config.layer_norm_eps
        self.attn_norm = RMSNorm(config.embed_dim, eps)
        self.ffn_norm = RMSNorm(config.embed_dim, eps)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run both sub-layers; ``mask`` is forwarded to attention unchanged."""
        attended = self.attention(self.attn_norm(x), mask)
        x = x + attended
        transformed = self.feed_forward(self.ffn_norm(x))
        return x + transformed
class SmallLM(nn.Module):
    """
    Small Language Model (SLM): a GPT-style, decoder-only Transformer.

    Architecture:
        Token embedding (positions enter via RoPE inside attention)
        -> N x TransformerBlock (attention + FFN)
        -> RMSNorm
        -> Linear output head (next-token logits), tied to the embedding
    """

    def __init__(self, config: SLMConfig):
        super().__init__()
        self.config = config

        self.token_embedding = nn.Embedding(config.vocab_size, config.embed_dim)
        self.layers = nn.ModuleList([
            TransformerBlock(config) for _ in range(config.num_layers)
        ])
        self.norm = RMSNorm(config.embed_dim, config.layer_norm_eps)
        self.lm_head = nn.Linear(config.embed_dim, config.vocab_size, bias=False)

        # Weight tying: the output head shares the embedding matrix.
        self.lm_head.weight = self.token_embedding.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Linear/Embedding weights from N(0, 0.02^2) and zero any bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """
        Compute next-token logits for every position.

        Args:
            input_ids: Token IDs, shape (batch, seq_len). ``seq_len`` must not
                exceed ``config.max_seq_len`` because the rotary tables are
                precomputed only up to that length.
        Returns:
            Logits, shape (batch, seq_len, vocab_size)
        """
        x = self.token_embedding(input_ids)

        # Causal mask, built once and shared by all layers: True above the
        # diagonal marks future positions that must stay hidden.
        seq_len = input_ids.shape[1]
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=input_ids.device, dtype=torch.bool),
            diagonal=1
        )

        for layer in self.layers:
            x = layer(x, mask)

        x = self.norm(x)
        logits = self.lm_head(x)
        return logits

    def count_parameters(self) -> int:
        """Return the number of trainable parameters (tied weights count once)."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 50,
        temperature: float = 0.8,
        top_k: int = 40,
        top_p: float = 0.9,
        eos_token_id: Optional[int] = 3,
    ) -> torch.Tensor:
        """
        Autoregressive text generation.

        Switches the module to eval mode and leaves it in eval mode.

        Args:
            input_ids: Starting token IDs, shape (batch, seq_len).
            max_new_tokens: Maximum number of tokens to append.
            temperature: Sampling temperature (lower = more deterministic).
                A value <= 0 selects greedy decoding (argmax) instead of
                dividing by zero.
            top_k: Keep only the k highest logits; values <= 0 disable it.
            top_p: Nucleus sampling threshold; values >= 1.0 disable it.
            eos_token_id: Token ID that ends a sequence (default 3, the
                <EOS> token). Pass None to always generate
                ``max_new_tokens`` tokens.
        Returns:
            ``input_ids`` with the generated tokens appended along dim 1.
            Rows that finish early are padded with ``eos_token_id`` until
            every row has finished.
        """
        self.eval()
        greedy = temperature <= 0
        # Per-row flag so batches larger than one can stop independently.
        finished = torch.zeros(
            input_ids.shape[0], dtype=torch.bool, device=input_ids.device
        )

        for _ in range(max_new_tokens):
            # Crop to the context window the rotary tables support.
            idx_cond = input_ids[:, -self.config.max_seq_len:]
            logits = self(idx_cond)[:, -1, :]

            if greedy:
                next_token = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                logits = logits / temperature

                # Top-k: drop everything below the k-th largest logit
                # (ties with the threshold are kept).
                if top_k > 0:
                    k = min(top_k, logits.size(-1))
                    top_k_values, _ = torch.topk(logits, k)
                    logits = logits.masked_fill(
                        logits < top_k_values[:, [-1]], float('-inf')
                    )

                # Top-p: keep the smallest prefix of the sorted distribution
                # whose probability mass reaches top_p.
                if top_p < 1.0:
                    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                    sorted_probs = F.softmax(sorted_logits, dim=-1)
                    # Mass strictly before each token (exclusive cumsum).
                    mass_before = torch.cumsum(sorted_probs, dim=-1) - sorted_probs
                    sorted_mask = mass_before >= top_p
                    # Always keep the most likely token so top_p <= 0 cannot
                    # produce an all -inf row (NaN probabilities).
                    sorted_mask[..., 0] = False
                    sorted_logits = sorted_logits.masked_fill(sorted_mask, float('-inf'))
                    # sorted_indices is a permutation, so scatter overwrites
                    # every position and restores vocabulary order.
                    logits = logits.scatter(1, sorted_indices, sorted_logits)

                probs = F.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

            if eos_token_id is not None:
                # Rows that already ended keep emitting EOS as padding.
                next_token = next_token.masked_fill(finished.unsqueeze(1), eos_token_id)
                finished = finished | (next_token.squeeze(1) == eos_token_id)

            input_ids = torch.cat([input_ids, next_token], dim=1)

            if eos_token_id is not None and bool(finished.all()):
                break

        return input_ids

    def save_pretrained(self, directory: str):
        """Write ``config.json`` and ``model.safetensors`` into ``directory``.

        The directory is created if needed. ``lm_head.weight`` is omitted
        because it is the same tensor as ``token_embedding.weight``.
        """
        os.makedirs(directory, exist_ok=True)
        self.config.save(os.path.join(directory, "config.json"))

        from safetensors.torch import save_file
        state_dict = {
            key: tensor
            for key, tensor in self.state_dict().items()
            if key != "lm_head.weight"  # skip tied weight
        }
        save_file(state_dict, os.path.join(directory, "model.safetensors"))
        print(f"💾 Model saved to: {directory}")

    @classmethod
    def from_pretrained(cls, directory: str, device: str = "cpu") -> "SmallLM":
        """Load a model written by :meth:`save_pretrained`.

        Args:
            directory: Folder containing ``config.json`` and
                ``model.safetensors``.
            device: Device the model is moved to after loading.
        Returns:
            The model in eval mode on ``device``.
        """
        config = SLMConfig.load(os.path.join(directory, "config.json"))
        model = cls(config)

        from safetensors.torch import load_file
        state_dict = load_file(os.path.join(directory, "model.safetensors"))
        # The tied head is not stored on disk; re-point it at the embedding
        # so the strict state-dict load finds every expected key.
        if "lm_head.weight" not in state_dict and "token_embedding.weight" in state_dict:
            state_dict["lm_head.weight"] = state_dict["token_embedding.weight"]
        model.load_state_dict(state_dict)
        model.to(device)
        model.eval()

        print(f"✅ Model loaded from: {directory}")
        print(f" Parameters: {model.count_parameters():,}")
        return model
if __name__ == "__main__":
    # Smoke test: build a default-sized model and push one random batch
    # through it to confirm the output shape.
    cfg = SLMConfig()
    net = SmallLM(cfg)
    for line in (
        "🧠 SmallLM Architecture",
        f" Parameters: {net.count_parameters():,}",
        f" Config: {cfg}",
    ):
        print(line)

    # Two sequences of 32 random token ids drawn from the full vocabulary.
    token_ids = torch.randint(0, cfg.vocab_size, (2, 32))
    scores = net(token_ids)
    for line in (
        f"\n Input shape: {token_ids.shape}",
        f" Output shape: {scores.shape}",
        " ✅ Forward pass works!",
    ):
        print(line)