"""
Small Language Model (SLM) - Transformer from Scratch
======================================================
Arsitektur Transformer decoder-only (GPT-style) untuk Bahasa Indonesia.
Dibangun dari nol menggunakan PyTorch.

Author: Jekardah AI Lab
"""

import math
import json
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass, asdict
from typing import Optional


@dataclass
class SLMConfig:
    """Hyperparameters of the SLM model, with JSON persistence helpers."""
    vocab_size: int = 32000
    embed_dim: int = 256
    num_heads: int = 4
    num_layers: int = 4
    ffn_dim: int = 512
    max_seq_len: int = 128
    dropout: float = 0.1
    layer_norm_eps: float = 1e-5

    def save(self, path: str):
        """Write every field of this config to ``path`` as indented JSON."""
        with open(path, "w") as handle:
            handle.write(json.dumps(asdict(self), indent=2))

    @classmethod
    def load(cls, path: str) -> "SLMConfig":
        """Build a config from the JSON object stored at ``path``.

        The JSON keys are passed to the constructor as keyword arguments.
        """
        with open(path, "r") as handle:
            fields = json.loads(handle.read())
        return cls(**fields)


class RMSNorm(nn.Module):
    """Root Mean Square layer normalization with a learnable per-feature gain.

    Unlike LayerNorm, no mean is subtracted and there is no bias term.
    """

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        # Per-feature scale, initialised to 1 so the layer starts as pure normalization.
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Scale ``x`` by the inverse RMS of its last dimension, then by ``weight``."""
        mean_square = torch.mean(x.pow(2), dim=-1, keepdim=True)
        # eps keeps the reciprocal square root finite for all-zero rows.
        inv_rms = torch.rsqrt(mean_square + self.eps)
        normalized = x * inv_rms
        return normalized * self.weight


class RotaryPositionalEncoding(nn.Module):
    """
    Rotary Position Embedding (RoPE) lookup tables.

    Precomputes cos/sin tables for positions ``0 .. max_seq_len - 1`` and
    ``dim // 2`` rotation frequencies (for even ``dim``), and serves prefixes
    of those tables.

    Args:
        dim: Size of the vectors that will be rotated (the per-head dimension).
        max_seq_len: Number of positions to precompute.
        base: Base of the geometric frequency progression.
    """

    def __init__(self, dim: int, max_seq_len: int = 128, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

        # Precompute angle = position * frequency for every (position, frequency) pair.
        positions = torch.arange(max_seq_len).float()
        freqs = torch.outer(positions, inv_freq)
        cos_cached = freqs.cos()
        sin_cached = freqs.sin()
        # Buffer names are part of the state dict; keep them stable.
        self.register_buffer("cos_cached", cos_cached)
        self.register_buffer("sin_cached", sin_cached)

    def forward(self, seq_len: int):
        """Return the ``(cos, sin)`` tables for the first ``seq_len`` positions.

        Args:
            seq_len: Number of positions requested.

        Returns:
            Tuple ``(cos, sin)``, each of shape ``(seq_len, dim // 2)``.

        Raises:
            ValueError: If ``seq_len`` is negative or exceeds the number of
                precomputed positions. Without this check the slice would be
                silently truncated and fail later with an unrelated
                shape/broadcast error.
        """
        cached_len = self.cos_cached.shape[0]
        if not 0 <= seq_len <= cached_len:
            raise ValueError(
                f"seq_len={seq_len} is outside the precomputed RoPE range "
                f"[0, {cached_len}]"
            )
        return self.cos_cached[:seq_len], self.sin_cached[:seq_len]


def apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    """Rotate ``x`` with rotary position embeddings (split-half layout).

    Args:
        x: Tensor of shape (batch, heads, seq_len, head_dim).
        cos: Cosine table; its first ``seq_len`` rows are used, each of
            width head_dim / 2.
        sin: Sine table laid out like ``cos``.

    Returns:
        Tensor with the same shape as ``x`` where element ``i`` of the first
        half is rotated together with element ``i`` of the second half.
    """
    seq_len = x.shape[2]
    half = x.shape[-1] // 2
    first_half = x[..., :half]
    second_half = x[..., half:]

    # Duplicate the half-width tables so they cover the full head_dim, then
    # add leading batch and head axes for broadcasting: (1, 1, seq, head_dim).
    cos_rows = cos[:seq_len]
    sin_rows = sin[:seq_len]
    cos_full = torch.cat((cos_rows, cos_rows), dim=-1)[None, None]
    sin_full = torch.cat((sin_rows, sin_rows), dim=-1)[None, None]

    # 90-degree rotated companion of x: (-second_half, first_half).
    rotated = torch.cat((-second_half, first_half), dim=-1)
    return x * cos_full + rotated * sin_full


class MultiHeadSelfAttention(nn.Module):
    """
    Multi-head self-attention with rotary position embeddings and a causal mask.

    When no mask is supplied, each token may only attend to itself and to
    earlier tokens (autoregressive behaviour).
    """

    def __init__(self, config: SLMConfig):
        super().__init__()
        dim, heads = config.embed_dim, config.num_heads
        self.num_heads = heads
        self.head_dim = dim // heads
        self.embed_dim = dim

        assert dim % heads == 0, \
            "embed_dim harus bisa dibagi num_heads"

        # Bias-free projections for queries, keys and values.
        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)

        # Mixes the concatenated heads back into the model dimension.
        self.out_proj = nn.Linear(dim, dim, bias=False)

        # Rotary position tables, sized per head.
        self.rope = RotaryPositionalEncoding(self.head_dim, config.max_seq_len)

        # Dropout applied to the attention probabilities.
        self.attn_dropout = nn.Dropout(config.dropout)

    def _split_heads(self, t: torch.Tensor, batch: int, seq: int) -> torch.Tensor:
        """Reshape (batch, seq, embed_dim) into (batch, heads, seq, head_dim)."""
        return t.view(batch, seq, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply self-attention to ``x``.

        Args:
            x: Input of shape (batch, seq_len, embed_dim).
            mask: Optional boolean (seq_len, seq_len) tensor where True marks
                positions that must NOT be attended to. Defaults to a strict
                upper-triangular (causal) mask.

        Returns:
            Tensor of shape (batch, seq_len, embed_dim).
        """
        bsz, n_tok, _ = x.shape

        queries = self._split_heads(self.q_proj(x), bsz, n_tok)
        keys = self._split_heads(self.k_proj(x), bsz, n_tok)
        values = self._split_heads(self.v_proj(x), bsz, n_tok)

        # Position information is injected by rotating queries and keys only.
        cos, sin = self.rope(n_tok)
        queries = apply_rotary_emb(queries, cos, sin)
        keys = apply_rotary_emb(keys, cos, sin)

        # Scaled dot-product scores: (batch, heads, seq, seq).
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is None:
            # True strictly above the diagonal == future positions.
            mask = torch.ones(
                n_tok, n_tok, device=x.device, dtype=torch.bool
            ).triu(diagonal=1)
        # Broadcast the 2-D mask over the batch and head axes.
        scores = scores.masked_fill(mask[None, None], float('-inf'))

        probs = self.attn_dropout(F.softmax(scores, dim=-1))

        context = torch.matmul(probs, values)

        # (batch, heads, seq, head_dim) -> (batch, seq, embed_dim)
        merged = context.transpose(1, 2).contiguous().view(bsz, n_tok, self.embed_dim)
        return self.out_proj(merged)


class FeedForward(nn.Module):
    """
    Position-wise feed-forward network with a SwiGLU gate.

    Computes ``down_proj(silu(gate_proj(x)) * up_proj(x))`` followed by dropout.
    """

    def __init__(self, config: SLMConfig):
        super().__init__()
        in_dim, hidden_dim = config.embed_dim, config.ffn_dim
        self.gate_proj = nn.Linear(in_dim, hidden_dim, bias=False)
        self.up_proj = nn.Linear(in_dim, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, in_dim, bias=False)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated feed-forward transform; the last dimension is preserved."""
        # SwiGLU: the SiLU-activated gate modulates the linear "up" branch.
        hidden = F.silu(self.gate_proj(x)) * self.up_proj(x)
        return self.dropout(self.down_proj(hidden))


class TransformerBlock(nn.Module):
    """
    One Transformer block: attention followed by a feed-forward network,
    each preceded by RMSNorm (pre-norm) and wrapped in a residual connection.
    """

    def __init__(self, config: SLMConfig):
        super().__init__()
        self.attention = MultiHeadSelfAttention(config)
        self.feed_forward = FeedForward(config)
        self.attn_norm = RMSNorm(config.embed_dim, config.layer_norm_eps)
        self.ffn_norm = RMSNorm(config.embed_dim, config.layer_norm_eps)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the block on ``x``; ``mask`` is forwarded to the attention layer."""
        # Attention sub-layer: normalize, attend, add back the input.
        attended = self.attention(self.attn_norm(x), mask)
        hidden = x + attended

        # Feed-forward sub-layer: normalize, transform, add back.
        return hidden + self.feed_forward(self.ffn_norm(hidden))


class SmallLM(nn.Module):
    """
    Small Language Model (SLM): a GPT-style, decoder-only Transformer.

    Architecture:
        Token embedding
            -> N x TransformerBlock (attention + FFN)
            -> RMSNorm
            -> output linear layer (next-token logits)

    The output projection shares its weight matrix with the token embedding
    (weight tying).
    """

    def __init__(self, config: SLMConfig):
        super().__init__()
        self.config = config

        # Token embedding
        self.token_embedding = nn.Embedding(config.vocab_size, config.embed_dim)

        # Transformer blocks
        self.layers = nn.ModuleList([
            TransformerBlock(config) for _ in range(config.num_layers)
        ])

        # Final norm
        self.norm = RMSNorm(config.embed_dim, config.layer_norm_eps)

        # Output head (predicts the next token)
        self.lm_head = nn.Linear(config.embed_dim, config.vocab_size, bias=False)

        # Weight tying: the output head reuses the embedding matrix.
        self.lm_head.weight = self.token_embedding.weight

        # Initialize weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02^2); zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """
        Forward pass.

        Args:
            input_ids: Token IDs, shape (batch, seq_len)

        Returns:
            Logits, shape (batch, seq_len, vocab_size)
        """
        # Token embedding
        x = self.token_embedding(input_ids)

        # Causal mask, built once and shared by every layer.
        # True strictly above the diagonal marks future positions to hide.
        seq_len = input_ids.shape[1]
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=input_ids.device, dtype=torch.bool),
            diagonal=1
        )

        # Transformer blocks
        for layer in self.layers:
            x = layer(x, mask)

        # Final norm + project to vocab
        x = self.norm(x)
        logits = self.lm_head(x)

        return logits

    def count_parameters(self) -> int:
        """Count total trainable parameters."""
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 50,
        temperature: float = 0.8,
        top_k: int = 40,
        top_p: float = 0.9,
        eos_token_id: Optional[int] = 3,
    ) -> torch.Tensor:
        """
        Autoregressive text generation.

        The model is switched to eval mode for the duration of the call and
        its previous train/eval mode is restored afterwards, so sampling in
        the middle of a training loop does not silently disable dropout for
        the remaining training steps.

        Args:
            input_ids: Starting token IDs, shape (batch, seq_len)
            max_new_tokens: Maximum tokens to generate
            temperature: Sampling temperature (lower = more deterministic);
                0 selects the most likely token (greedy decoding)
            top_k: Keep only the k highest logits (<= 0 disables)
            top_p: Nucleus (top-p) sampling threshold (>= 1.0 disables);
                the most likely token is always kept
            eos_token_id: Token ID that ends generation; defaults to 3, the
                value that used to be hard-coded. ``None`` disables early
                stopping.

        Returns:
            ``input_ids`` with the generated tokens appended along dim 1.
            With batch size > 1, rows that already produced EOS are padded
            with ``eos_token_id`` until every row has finished.

        Raises:
            ValueError: If ``temperature`` is negative.
        """
        if temperature < 0:
            raise ValueError(f"temperature must be >= 0, got {temperature}")

        was_training = self.training
        self.eval()
        try:
            # Per-row flag: has this sequence already emitted EOS?
            finished = torch.zeros(
                input_ids.shape[0], dtype=torch.bool, device=input_ids.device
            )

            for _ in range(max_new_tokens):
                # Crop to max context length
                idx_cond = input_ids[:, -self.config.max_seq_len:]

                # Only the logits of the last position are needed.
                logits = self(idx_cond)[:, -1, :]

                if temperature == 0:
                    # Greedy decoding; avoids dividing by zero.
                    next_token = torch.argmax(logits, dim=-1, keepdim=True)
                else:
                    logits = logits / temperature

                    # Top-k filtering
                    if top_k > 0:
                        k = min(top_k, logits.size(-1))
                        top_k_values, _ = torch.topk(logits, k)
                        kth_value = top_k_values[:, [-1]]
                        logits = logits.masked_fill(logits < kth_value, float('-inf'))

                    # Top-p (nucleus) filtering
                    if top_p < 1.0:
                        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                        sorted_probs = F.softmax(sorted_logits, dim=-1)
                        # Probability mass strictly before each token in sorted order.
                        mass_before = torch.cumsum(sorted_probs, dim=-1) - sorted_probs
                        sorted_mask = mass_before >= top_p
                        # Never drop the most likely token; otherwise top_p <= 0
                        # would mask everything and produce NaN probabilities.
                        sorted_mask[..., 0] = False
                        sorted_logits = sorted_logits.masked_fill(sorted_mask, float('-inf'))
                        # Undo the sort: every vocabulary slot is overwritten.
                        logits = logits.scatter(1, sorted_indices, sorted_logits)

                    # Sample
                    probs = F.softmax(logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)

                if eos_token_id is not None:
                    # Rows that already finished keep emitting EOS as padding.
                    next_token = next_token.masked_fill(finished.unsqueeze(1), eos_token_id)
                    finished = finished | (next_token.squeeze(1) == eos_token_id)

                # Append
                input_ids = torch.cat([input_ids, next_token], dim=1)

                # Stop once every sequence in the batch has produced EOS.
                if eos_token_id is not None and bool(finished.all()):
                    break
        finally:
            self.train(was_training)

        return input_ids

    def save_pretrained(self, directory: str):
        """Save the config (``config.json``) and weights (``model.safetensors``).

        Args:
            directory: Target directory; created if it does not exist.
        """
        os.makedirs(directory, exist_ok=True)

        # Save config
        self.config.save(os.path.join(directory, "config.json"))

        # Save weights as safetensors
        from safetensors.torch import save_file
        # lm_head.weight is tied to token_embedding.weight, so it is skipped
        # to avoid storing the same tensor twice.
        state_dict = {
            name: tensor
            for name, tensor in self.state_dict().items()
            if name != "lm_head.weight"
        }
        save_file(state_dict, os.path.join(directory, "model.safetensors"))

        print(f"💾 Model saved to: {directory}")

    @classmethod
    def from_pretrained(cls, directory: str, device: str = "cpu") -> "SmallLM":
        """Load a model written by ``save_pretrained``.

        Args:
            directory: Directory containing ``config.json`` and
                ``model.safetensors``.
            device: Device the model is moved to after loading.

        Returns:
            The loaded model, in eval mode.
        """
        config = SLMConfig.load(os.path.join(directory, "config.json"))
        model = cls(config)

        from safetensors.torch import load_file
        state_dict = load_file(os.path.join(directory, "model.safetensors"))
        # Restore the tied weight that save_pretrained leaves out.
        if "lm_head.weight" not in state_dict and "token_embedding.weight" in state_dict:
            state_dict["lm_head.weight"] = state_dict["token_embedding.weight"]
        model.load_state_dict(state_dict)
        model.to(device)
        model.eval()

        print(f"✅ Model loaded from: {directory}")
        print(f"   Parameters: {model.count_parameters():,}")
        return model


if __name__ == "__main__":
    # Smoke test: build a default-sized model and push one random batch through it.
    config = SLMConfig()
    model = SmallLM(config)

    print("🧠 SmallLM Architecture")
    print(f"   Parameters: {model.count_parameters():,}")
    print(f"   Config: {config}")

    # Two random sequences of 32 token IDs drawn from the vocabulary range.
    sample_ids = torch.randint(0, config.vocab_size, (2, 32))
    output = model(sample_ids)
    print(f"\n   Input shape:  {sample_ids.shape}")
    print(f"   Output shape: {output.shape}")
    print("   ✅ Forward pass works!")