Upload SLM Bahasa Indonesia

9815efc verified 3 months ago

13 kB

	"""
	Small Language Model (SLM) - Transformer from Scratch
	======================================================
	Arsitektur Transformer decoder-only (GPT-style) untuk Bahasa Indonesia.
	Dibangun dari nol menggunakan PyTorch.

	Author: Jekardah AI Lab
	"""

	import math
	import json
	import os
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from dataclasses import dataclass, asdict
	from typing import Optional


	@dataclass
	class SLMConfig:
	    """Hyperparameters for one SLM model.

	    The dataclass round-trips through a JSON file via ``save`` and ``load``.
	    """

	    # Number of entries in the token vocabulary.
	    vocab_size: int = 32000
	    # Width of the token embeddings and of every hidden state.
	    embed_dim: int = 256
	    # Number of attention heads per layer.
	    num_heads: int = 4
	    # Number of stacked transformer blocks.
	    num_layers: int = 4
	    # Hidden width of the feed-forward sub-layer.
	    ffn_dim: int = 512
	    # Maximum number of positions (context length).
	    max_seq_len: int = 128
	    # Dropout probability.
	    dropout: float = 0.1
	    # Epsilon used by the normalization layers.
	    layer_norm_eps: float = 1e-5

	    def save(self, path: str):
	        """Write all fields to ``path`` as JSON indented by two spaces."""
	        with open(path, "w") as handle:
	            handle.write(json.dumps(asdict(self), indent=2))

	    @classmethod
	    def load(cls, path: str) -> "SLMConfig":
	        """Build a config from the JSON object stored at ``path``.

	        Every key of the JSON object is passed to the constructor as a
	        keyword argument.
	        """
	        with open(path, "r") as handle:
	            fields = json.loads(handle.read())
	        return cls(**fields)


	class RMSNorm(nn.Module):
	    """Root-mean-square layer normalization with a learned per-feature gain.

	    Each vector along the last axis is multiplied by the reciprocal square
	    root of (mean of its squared elements + ``eps``) and then by ``weight``.
	    No mean is subtracted and there is no bias term.

	    Args:
	        dim: Length of the learned ``weight`` vector (size of the last axis).
	        eps: Constant added to the mean square before ``rsqrt``.
	    """

	    # Input dtypes whose statistics are computed in float32 instead.
	    _REDUCED_PRECISION = (torch.float16, torch.bfloat16)

	    def __init__(self, dim: int, eps: float = 1e-5):
	        super().__init__()
	        self.eps = eps
	        self.weight = nn.Parameter(torch.ones(dim))

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Normalize ``x`` over its last axis and apply the learned gain.

	        Args:
	            x: Tensor whose last axis has length ``dim``.

	        Returns:
	            ``x * rsqrt(mean(x**2, -1) + eps) * weight``.
	        """
	        # Squaring in float16 overflows to inf once |x| exceeds roughly 255,
	        # which makes rsqrt return 0 and silently zeroes the output. Compute
	        # the statistic in float32 for half-precision inputs and cast back.
	        # Full-precision inputs take the exact original code path.
	        needs_upcast = x.dtype in self._REDUCED_PRECISION
	        work = x.float() if needs_upcast else x
	        inv_rms = torch.rsqrt(work.pow(2).mean(-1, keepdim=True) + self.eps)
	        normed = work * inv_rms
	        if needs_upcast:
	            normed = normed.to(x.dtype)
	        return normed * self.weight


	class RotaryPositionalEncoding(nn.Module):
	    """Precomputed cosine/sine tables for Rotary Position Embedding (RoPE).

	    The constructor builds ``inv_freq[i] = 1 / base ** (2i / dim)`` and
	    tables of ``cos(t * inv_freq)`` and ``sin(t * inv_freq)`` for every
	    position ``t`` in ``[0, max_seq_len)``. All three are registered as
	    buffers.

	    Args:
	        dim: Feature size being rotated; one frequency per pair of features.
	        max_seq_len: Number of positions to precompute.
	        base: Base of the geometric frequency progression.
	    """

	    def __init__(self, dim: int, max_seq_len: int = 128, base: float = 10000.0):
	        super().__init__()
	        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
	        self.register_buffer("inv_freq", inv_freq)

	        # Angle table: row t holds t * inv_freq, shape (max_seq_len, len(inv_freq)).
	        positions = torch.arange(max_seq_len).float()
	        angles = torch.outer(positions, inv_freq)
	        self.register_buffer("cos_cached", angles.cos())
	        self.register_buffer("sin_cached", angles.sin())

	    def forward(self, seq_len: int):
	        """Return the first ``seq_len`` rows of the cosine and sine tables.

	        Args:
	            seq_len: Number of positions requested.

	        Returns:
	            Tuple ``(cos, sin)``, each with ``seq_len`` rows.

	        Raises:
	            ValueError: If ``seq_len`` exceeds the number of cached positions.
	                Slicing would otherwise return a shorter table without any
	                indication that positions are missing.
	        """
	        cached_len = self.cos_cached.size(0)
	        if seq_len > cached_len:
	            raise ValueError(
	                f"seq_len ({seq_len}) exceeds the precomputed RoPE table "
	                f"length ({cached_len})"
	            )
	        return self.cos_cached[:seq_len], self.sin_cached[:seq_len]


	def apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
	    """Rotate the features of ``x`` by position-dependent angles.

	    The last axis of ``x`` is split into a first and a second half; the
	    result is ``x * cos + concat(-second, first) * sin``, with ``cos`` and
	    ``sin`` tiled twice along the last axis to cover both halves.

	    Args:
	        x: Tensor indexed as (batch, heads, seq_len, head_dim).
	        cos: Cosine table with at least ``seq_len`` rows and ``head_dim // 2``
	            columns.
	        sin: Sine table with the same layout as ``cos``.

	    Returns:
	        Tensor with the same shape as ``x``.
	    """
	    seq_len = x.shape[2]
	    half = x.shape[-1] // 2
	    first, second = x[..., :half], x[..., half:]

	    # Trim the tables to the sequence length, add broadcast axes for batch
	    # and heads, and tile to full head_dim: (1, 1, seq, head_dim).
	    cos_full = cos[:seq_len][None, None].repeat(1, 1, 1, 2)
	    sin_full = sin[:seq_len][None, None].repeat(1, 1, 1, 2)

	    swapped = torch.cat((-second, first), dim=-1)
	    return x * cos_full + swapped * sin_full


	class MultiHeadSelfAttention(nn.Module):
	    """Multi-head self-attention with rotary positions and a causal mask.

	    Queries and keys are rotated with RoPE before the scaled dot product.
	    Masked positions are set to ``-inf`` before the softmax, so with the
	    default upper-triangular mask each token attends only to itself and to
	    earlier tokens.
	    """

	    def __init__(self, config: SLMConfig):
	        super().__init__()
	        width, heads = config.embed_dim, config.num_heads
	        self.num_heads = heads
	        self.head_dim = width // heads
	        self.embed_dim = width

	        assert width % heads == 0, "embed_dim harus bisa dibagi num_heads"

	        # Bias-free projections for queries, keys and values.
	        self.q_proj = nn.Linear(width, width, bias=False)
	        self.k_proj = nn.Linear(width, width, bias=False)
	        self.v_proj = nn.Linear(width, width, bias=False)

	        # Projection applied after the heads are merged back together.
	        self.out_proj = nn.Linear(width, width, bias=False)

	        # Rotary tables sized per head.
	        self.rope = RotaryPositionalEncoding(self.head_dim, config.max_seq_len)

	        # Dropout applied to the attention probabilities.
	        self.attn_dropout = nn.Dropout(config.dropout)

	    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
	        """Attend over ``x``.

	        Args:
	            x: Input of shape (batch, seq_len, embed_dim).
	            mask: Optional boolean (seq_len, seq_len) tensor; ``True`` marks
	                positions that must not be attended to. When omitted, a
	                strictly upper-triangular causal mask is built.

	        Returns:
	            Tensor of shape (batch, seq_len, embed_dim).
	        """
	        batch_size, seq_len, _ = x.shape
	        per_head = (batch_size, seq_len, self.num_heads, self.head_dim)

	        def split_heads(t: torch.Tensor) -> torch.Tensor:
	            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
	            return t.view(per_head).transpose(1, 2)

	        queries = split_heads(self.q_proj(x))
	        keys = split_heads(self.k_proj(x))
	        values = split_heads(self.v_proj(x))

	        # Rotary position information goes into queries and keys only.
	        cos, sin = self.rope(seq_len)
	        queries = apply_rotary_emb(queries, cos, sin)
	        keys = apply_rotary_emb(keys, cos, sin)

	        # Scaled dot-product scores: (batch, heads, seq, seq).
	        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)

	        if mask is None:
	            all_pairs = torch.ones(seq_len, seq_len, device=x.device, dtype=torch.bool)
	            mask = torch.triu(all_pairs, diagonal=1)
	        scores = scores.masked_fill(mask[None, None], float("-inf"))

	        probs = self.attn_dropout(F.softmax(scores, dim=-1))
	        context = torch.matmul(probs, values)

	        # Merge heads: (batch, heads, seq, head_dim) -> (batch, seq, embed).
	        merged = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim)
	        return self.out_proj(merged)


	class FeedForward(nn.Module):
	    """Position-wise feed-forward network with a SwiGLU gate.

	    Computes ``dropout(down_proj(silu(gate_proj(x)) * up_proj(x)))`` using
	    three bias-free linear layers.
	    """

	    def __init__(self, config: SLMConfig):
	        super().__init__()
	        width, hidden = config.embed_dim, config.ffn_dim
	        self.gate_proj = nn.Linear(width, hidden, bias=False)
	        self.up_proj = nn.Linear(width, hidden, bias=False)
	        self.down_proj = nn.Linear(hidden, width, bias=False)
	        self.dropout = nn.Dropout(config.dropout)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply the gated feed-forward transform to the last axis of ``x``.

	        Args:
	            x: Tensor whose last axis has length ``embed_dim``.

	        Returns:
	            Tensor with the same shape as ``x``.
	        """
	        # SwiGLU: the SiLU-activated gate scales the linear "up" branch.
	        gated = F.silu(self.gate_proj(x)) * self.up_proj(x)
	        return self.dropout(self.down_proj(gated))


	class TransformerBlock(nn.Module):
	    """One pre-norm transformer block: attention, then feed-forward.

	    Each sub-layer receives an RMS-normalized copy of its input, and its
	    output is added back onto the un-normalized input (residual connection).
	    """

	    def __init__(self, config: SLMConfig):
	        super().__init__()
	        self.attention = MultiHeadSelfAttention(config)
	        self.feed_forward = FeedForward(config)

	        width, eps = config.embed_dim, config.layer_norm_eps
	        self.attn_norm = RMSNorm(width, eps)
	        self.ffn_norm = RMSNorm(width, eps)

	    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
	        """Run the block.

	        Args:
	            x: Hidden states of shape (batch, seq_len, embed_dim).
	            mask: Optional attention mask forwarded unchanged to the
	                attention sub-layer.

	        Returns:
	            Updated hidden states with the same shape as ``x``.
	        """
	        # Attention sub-layer with residual.
	        attended = self.attention(self.attn_norm(x), mask)
	        hidden = x + attended

	        # Feed-forward sub-layer with residual.
	        transformed = self.feed_forward(self.ffn_norm(hidden))
	        return hidden + transformed


	class SmallLM(nn.Module):
	    """Small decoder-only (GPT-style) transformer language model.

	    Pipeline:
	        token embedding
	        -> ``num_layers`` x TransformerBlock (attention + feed-forward)
	        -> RMSNorm
	        -> linear projection to vocabulary logits

	    The output projection shares its weight matrix with the token embedding.
	    """

	    def __init__(self, config: SLMConfig):
	        super().__init__()
	        self.config = config

	        # Token embedding table.
	        self.token_embedding = nn.Embedding(config.vocab_size, config.embed_dim)

	        # Stack of transformer blocks.
	        self.layers = nn.ModuleList([
	            TransformerBlock(config) for _ in range(config.num_layers)
	        ])

	        # Normalization applied after the last block.
	        self.norm = RMSNorm(config.embed_dim, config.layer_norm_eps)

	        # Output head producing next-token logits.
	        self.lm_head = nn.Linear(config.embed_dim, config.vocab_size, bias=False)

	        # Weight tying: the head reuses the embedding matrix.
	        self.lm_head.weight = self.token_embedding.weight

	        self.apply(self._init_weights)

	    def _init_weights(self, module):
	        """Initialize Linear/Embedding weights from N(0, 0.02**2); zero Linear biases."""
	        if isinstance(module, nn.Linear):
	            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
	            if module.bias is not None:
	                torch.nn.init.zeros_(module.bias)
	        elif isinstance(module, nn.Embedding):
	            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

	    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
	        """Compute next-token logits for every position.

	        Args:
	            input_ids: Token IDs, shape (batch, seq_len).

	        Returns:
	            Logits, shape (batch, seq_len, vocab_size).

	        Raises:
	            ValueError: If ``seq_len`` exceeds ``config.max_seq_len``, the
	                number of positions the rotary tables were built for.
	        """
	        seq_len = input_ids.shape[1]
	        if seq_len > self.config.max_seq_len:
	            # Fail early with a clear message instead of an opaque shape
	            # error deep inside the attention layers.
	            raise ValueError(
	                f"Sequence length {seq_len} exceeds max_seq_len "
	                f"{self.config.max_seq_len}"
	            )

	        x = self.token_embedding(input_ids)

	        # Causal mask shared by all layers: True above the diagonal marks
	        # future positions that must be hidden.
	        mask = torch.triu(
	            torch.ones(seq_len, seq_len, device=input_ids.device, dtype=torch.bool),
	            diagonal=1
	        )

	        for layer in self.layers:
	            x = layer(x, mask)

	        x = self.norm(x)
	        return self.lm_head(x)

	    def count_parameters(self) -> int:
	        """Return the number of trainable parameter elements."""
	        return sum(p.numel() for p in self.parameters() if p.requires_grad)

	    @staticmethod
	    def _sample_next_token(logits: torch.Tensor, top_k: int, top_p: float) -> torch.Tensor:
	        """Sample one token per row from temperature-scaled logits after top-k/top-p filtering."""
	        # Top-k: drop everything strictly below the k-th largest logit.
	        if top_k > 0:
	            kth_best = torch.topk(logits, min(top_k, logits.size(-1))).values[:, [-1]]
	            logits = logits.masked_fill(logits < kth_best, float("-inf"))

	        # Top-p: drop a token once the probability mass ranked before it
	        # already reaches top_p.
	        if top_p < 1.0:
	            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
	            sorted_probs = F.softmax(sorted_logits, dim=-1)
	            mass_before = torch.cumsum(sorted_probs, dim=-1) - sorted_probs
	            drop_sorted = mass_before >= top_p
	            # Always keep the best token so top_p <= 0 cannot empty the row.
	            drop_sorted[:, 0] = False
	            # Map the decision from sorted order back to vocabulary order.
	            drop = drop_sorted.scatter(1, sorted_indices, drop_sorted)
	            logits = logits.masked_fill(drop, float("-inf"))

	        probs = F.softmax(logits, dim=-1)
	        return torch.multinomial(probs, num_samples=1)

	    @torch.no_grad()
	    def generate(
	        self,
	        input_ids: torch.Tensor,
	        max_new_tokens: int = 50,
	        temperature: float = 0.8,
	        top_k: int = 40,
	        top_p: float = 0.9,
	        eos_token_id: Optional[int] = 3,
	    ) -> torch.Tensor:
	        """Generate tokens autoregressively.

	        Args:
	            input_ids: Prompt token IDs, shape (batch, seq_len).
	            max_new_tokens: Upper bound on the number of generated tokens.
	            temperature: Sampling temperature (lower = more deterministic).
	                ``0`` selects the highest-scoring token at every step.
	            top_k: Keep only the ``top_k`` highest logits; ``0`` disables.
	            top_p: Nucleus sampling threshold; ``1.0`` or more disables.
	            eos_token_id: Token ID that ends a sequence (default ``3``, the
	                <EOS> token). ``None`` disables early stopping.

	        Returns:
	            ``input_ids`` with the generated tokens appended along axis 1.
	            Generation stops early once every row has produced
	            ``eos_token_id``; rows that finish before the others are padded
	            with ``eos_token_id``.

	        Raises:
	            ValueError: If ``temperature`` is negative.

	        The model runs in eval mode during generation, and the previous
	        train/eval mode is restored before returning.
	        """
	        if temperature < 0:
	            raise ValueError(f"temperature must be >= 0, got {temperature}")

	        was_training = self.training
	        self.eval()

	        # Per-row flag: has this sequence already emitted EOS?
	        finished = torch.zeros(
	            input_ids.size(0), dtype=torch.bool, device=input_ids.device
	        )

	        try:
	            for _ in range(max_new_tokens):
	                # Crop the context to the last max_seq_len tokens.
	                idx_cond = input_ids[:, -self.config.max_seq_len:]

	                # Only the logits of the final position are needed.
	                logits = self(idx_cond)[:, -1, :]

	                if temperature == 0:
	                    # Greedy decoding; dividing by zero would produce NaNs.
	                    next_token = logits.argmax(dim=-1, keepdim=True)
	                else:
	                    next_token = self._sample_next_token(
	                        logits / temperature, top_k, top_p
	                    )

	                if eos_token_id is not None:
	                    # Rows that already ended keep emitting EOS as padding.
	                    next_token = next_token.masked_fill(
	                        finished.unsqueeze(1), eos_token_id
	                    )
	                    finished = finished | (next_token.squeeze(1) == eos_token_id)

	                input_ids = torch.cat([input_ids, next_token], dim=1)

	                # Stop once every sequence in the batch has ended.
	                if eos_token_id is not None and bool(finished.all()):
	                    break
	        finally:
	            # Do not leave a model that was training stuck in eval mode.
	            self.train(was_training)

	        return input_ids

	    def save_pretrained(self, directory: str):
	        """Write ``config.json`` and ``model.safetensors`` into ``directory``.

	        The directory is created if it does not exist. ``lm_head.weight`` is
	        omitted from the weights file because it is tied to
	        ``token_embedding.weight``.
	        """
	        os.makedirs(directory, exist_ok=True)

	        self.config.save(os.path.join(directory, "config.json"))

	        from safetensors.torch import save_file
	        state_dict = {
	            key: tensor
	            for key, tensor in self.state_dict().items()
	            if key != "lm_head.weight"  # tied to token_embedding.weight
	        }
	        save_file(state_dict, os.path.join(directory, "model.safetensors"))

	        print(f"💾 Model saved to: {directory}")

	    @classmethod
	    def from_pretrained(cls, directory: str, device: str = "cpu") -> "SmallLM":
	        """Load a model written by ``save_pretrained``.

	        Args:
	            directory: Folder containing ``config.json`` and
	                ``model.safetensors``.
	            device: Device the model is moved to after loading.

	        Returns:
	            The loaded model, in eval mode.
	        """
	        config = SLMConfig.load(os.path.join(directory, "config.json"))
	        model = cls(config)

	        from safetensors.torch import load_file
	        state_dict = load_file(os.path.join(directory, "model.safetensors"))
	        # The tied head weight is not stored on disk; restore it from the
	        # embedding so the state dict matches the module's keys.
	        if "lm_head.weight" not in state_dict and "token_embedding.weight" in state_dict:
	            state_dict["lm_head.weight"] = state_dict["token_embedding.weight"]
	        model.load_state_dict(state_dict)
	        model.to(device)
	        model.eval()

	        print(f"✅ Model loaded from: {directory}")
	        print(f" Parameters: {model.count_parameters():,}")
	        return model


	if __name__ == "__main__":
	    # Smoke test: build a default-sized model, report its size, and push one
	    # random batch through it.
	    cfg = SLMConfig()
	    lm = SmallLM(cfg)

	    print("🧠 SmallLM Architecture")
	    print(f" Parameters: {lm.count_parameters():,}")
	    print(f" Config: {cfg}")

	    # Random token IDs: batch of 2 sequences, 32 tokens each.
	    sample_ids = torch.randint(0, cfg.vocab_size, (2, 32))
	    scores = lm(sample_ids)
	    print(f"\n Input shape: {sample_ids.shape}")
	    print(f" Output shape: {scores.shape}")
	    print(" ✅ Forward pass works!")