# ============================================================================
# GeoLIP ViT: HuggingFace AutoModel
#
# Usage:
#   from transformers import AutoModel
#   model = AutoModel.from_pretrained("AbstractPhil/geolip-vit-base-x3",
#                                      trust_remote_code=True)
#
#   from torchvision import transforms
#   transform = transforms.Compose([
#       transforms.Resize((224, 224)),
#       transforms.ToTensor(),
#       transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
#   ])
#   pixel_values = transform(image).unsqueeze(0)
#   outputs = model(pixel_values)
#
#   # 128-d embedding on hypersphere (L2-normalized)
#   embedding = outputs.embedding              # (B, 128)
#
#   # Multi-label classification logits (80 COCO classes)
#   logits = outputs.logits                    # (B, 80) — None unless soup_enabled
#
#   # Triangulation distances to 256 constellation anchors
#   triangulation = outputs.triangulation      # (B, 256) — None unless soup_enabled
#
#   # Nearest anchor index per sample
#   nearest = outputs.nearest                  # (B,) — None unless soup_enabled
#
#   # Geometric diagnostics
#   diagnostics = outputs.diagnostics          # dict — empty unless soup_enabled
# ============================================================================

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PretrainedConfig, PreTrainedModel
from dataclasses import dataclass, field
from typing import Optional, Dict, Any


# ══════════════════════════════════════════════════════════════════
# CONFIG
# ══════════════════════════════════════════════════════════════════

class GeoLIPViTConfig(PretrainedConfig):
    """Configuration for ``GeoLIPViTModel``.

    Args:
        image_size: Side length used with ``patch_size`` to size the
            positional embedding, ``(image_size // patch_size) ** 2 + 1``.
        patch_size: Kernel size and stride of the patch-embedding convolution.
        hidden_size: Width of the transformer token stream.
        num_attention_heads: Attention heads per encoder layer.
        num_hidden_layers: Number of encoder layers.
        intermediate_size: Feed-forward width inside each encoder layer.
        output_dim: Dimension of the L2-normalized output embedding and of
            the constellation anchors.
        n_anchors: Number of constellation anchors.
        n_comp: Number of patchwork components the anchors are split into.
        d_comp: Output width of each patchwork component.
        n_classes: Number of classifier logits.
        hidden_dropout_prob: Dropout applied after the embedding norm and
            inside each encoder layer.
        soup_enabled: Build the constellation, patchwork and classifier.
        consensus_cv: Stored on the config; not read by the model code in
            this file.
        experts: Expert names recorded on the config; not read by the model
            code in this file. ``None`` selects the three default names; an
            explicitly passed list, including an empty one, is kept as given.
        **kwargs: Forwarded to ``PretrainedConfig``.
    """

    model_type = "geolip_vit"

    def __init__(
        self,
        image_size: int = 224,
        patch_size: int = 16,
        hidden_size: int = 384,
        num_attention_heads: int = 6,
        num_hidden_layers: int = 6,
        intermediate_size: int = 1536,
        output_dim: int = 128,
        n_anchors: int = 256,
        n_comp: int = 8,
        d_comp: int = 64,
        n_classes: int = 80,
        hidden_dropout_prob: float = 0.1,
        soup_enabled: bool = True,
        consensus_cv: float = 0.2731,
        experts: Optional[list] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.image_size = image_size
        self.patch_size = patch_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.output_dim = output_dim
        self.n_anchors = n_anchors
        self.n_comp = n_comp
        self.d_comp = d_comp
        self.n_classes = n_classes
        self.hidden_dropout_prob = hidden_dropout_prob
        self.soup_enabled = soup_enabled
        self.consensus_cv = consensus_cv
        # Only substitute defaults when nothing was passed: `experts or [...]`
        # would also overwrite an explicit empty list, so a config saved with
        # `experts=[]` would reload with three experts.
        if experts is None:
            experts = ["clip_l14_openai", "dinov2_b14", "siglip_b16_384"]
        self.experts = experts


# ══════════════════════════════════════════════════════════════════
# OUTPUT
# ══════════════════════════════════════════════════════════════════

@dataclass
class GeoLIPViTOutput:
    """
    Result container for a GeoLIP ViT forward pass.

    All fields default to ``None``; ``B`` is the batch size.

        embedding       (B, output_dim), L2-normalized on the hypersphere
        logits          (B, n_classes), multi-label classification
                        (if soup_enabled)
        triangulation   (B, n_anchors), distances to constellation anchors
        nearest         (B,), nearest anchor index
        patch_tokens    (B, n_patches, hidden_size), pre-pooling patch
                        representations
        diagnostics     dict of geometric metrics
    """

    # Unit-norm output embedding.
    embedding: Optional[torch.Tensor] = None
    # Classification logits.
    logits: Optional[torch.Tensor] = None
    # Distance from each embedding to every anchor.
    triangulation: Optional[torch.Tensor] = None
    # Index of the closest anchor for each sample.
    nearest: Optional[torch.Tensor] = None
    # Patch-level token representations taken before pooling.
    patch_tokens: Optional[torch.Tensor] = None
    # Geometric metrics keyed by name.
    diagnostics: Optional[Dict[str, Any]] = None


# ══════════════════════════════════════════════════════════════════
# GEOMETRIC COMPONENTS
# ══════════════════════════════════════════════════════════════════

class Constellation(nn.Module):
    """Learnable set of ``n_anchors`` reference directions in ``d`` dimensions."""

    def __init__(self, n_anchors, d):
        super().__init__()
        self.n_anchors = n_anchors
        # Anchors start as random Gaussian rows scaled to unit length.
        initial = torch.randn(n_anchors, d)
        self.anchors = nn.Parameter(F.normalize(initial, dim=-1))

    def triangulate(self, emb):
        """Measure ``emb`` against every anchor.

        The anchors are re-normalized on each call, so the similarity uses
        unit-length anchor rows whatever the parameter's current norm is.

        Returns:
            A pair ``(distances, nearest)``: ``1 - similarity`` to each
            anchor, and the index of the most similar anchor per row.
        """
        unit_anchors = F.normalize(self.anchors, dim=-1)
        similarity = torch.matmul(emb, unit_anchors.t())
        nearest = similarity.argmax(dim=-1)
        return 1.0 - similarity, nearest


class Patchwork(nn.Module):
    """Encode triangulation distances with one small MLP per anchor group.

    Anchor column ``i`` belongs to component ``i % n_comp``. Each component
    maps its columns through ``Linear -> GELU -> Linear -> LayerNorm`` to
    ``d_comp`` features; the component outputs are concatenated on the last
    dimension, giving ``n_comp * d_comp`` features per row.

    Args:
        n_anchors: Number of anchor columns expected on dimension 1 of the
            input.
        n_comp: Number of components (groups).
        d_comp: Output width of each component.
    """

    def __init__(self, n_anchors, n_comp, d_comp):
        super().__init__()
        self.n_comp = n_comp
        self.n_anchors = n_anchors
        # forward() no longer reads this buffer; it stays registered so that
        # state_dicts keep their ``asgn`` key.
        asgn = torch.arange(n_anchors) % n_comp
        self.register_buffer("asgn", asgn)
        # Compute input sizes from ints, not tensors (meta-tensor safe).
        # The first `remainder` components each take one extra column.
        anchors_per_comp = n_anchors // n_comp
        remainder = n_anchors % n_comp
        self.comps = nn.ModuleList([nn.Sequential(
            nn.Linear(anchors_per_comp + (1 if k < remainder else 0), d_comp * 2),
            nn.GELU(),
            nn.Linear(d_comp * 2, d_comp), nn.LayerNorm(d_comp))
            for k in range(n_comp)])

    def forward(self, tri):
        """Return the concatenated component encodings of ``tri``.

        Args:
            tri: Tensor whose dimension 1 has ``n_anchors`` columns.

        Raises:
            IndexError: If ``tri`` has fewer than two dimensions or its
                dimension 1 is not ``n_anchors`` wide. This is the exception
                type the earlier mask-based indexing raised.
        """
        if tri.dim() < 2 or tri.shape[1] != self.n_anchors:
            raise IndexError(
                f"Patchwork expects {self.n_anchors} anchor columns on dim 1, "
                f"got input of shape {tuple(tri.shape)}")
        # Columns with index % n_comp == k are exactly the strided slice
        # k::n_comp. Slicing avoids boolean-mask indexing, whose output shape
        # depends on the mask's values.
        step = self.n_comp
        return torch.cat(
            [comp(tri[:, k::step]) for k, comp in enumerate(self.comps)], -1)


# ══════════════════════════════════════════════════════════════════
# MODEL
# ══════════════════════════════════════════════════════════════════

class GeoLIPViTModel(PreTrainedModel):
    """
    From-scratch Vision Transformer producing L2-normalized embeddings
    on a hypersphere (128-d by default), geometrically anchored by a
    constellation of reference points (256 by default) trained via
    3-expert consensus distillation.

    The encoder is trained from Xavier initialization against consensus
    targets from CLIP ViT-L/14, DINOv2 ViT-B/14, and SigLIP ViT-B/16.

    Optional soup pipeline (constellation + patchwork + classifier)
    provides multi-label COCO classification from the embedding. When it is
    enabled, a "geo token" derived from the constellation is also injected
    in front of the sequence before every encoder layer.

    Output fields (shapes for the default config):
        embedding:      (B, 128)    L2-normalized, consensus-aligned
        logits:         (B, 80)     multi-label COCO logits (if soup_enabled)
        triangulation:  (B, 256)    distances to constellation anchors
        nearest:        (B,)        nearest anchor index
        patch_tokens:   (B, 196, 384)  pre-pooling patch representations
        diagnostics:    dict        geometric metrics
    """
    config_class = GeoLIPViTConfig
    # The principal input is `pixel_values`; the inherited default is
    # `input_ids`.
    main_input_name = "pixel_values"

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        n_patches = (config.image_size // config.patch_size) ** 2

        # ── Encoder ──
        self.patch_embed = nn.Conv2d(
            3, config.hidden_size,
            kernel_size=config.patch_size, stride=config.patch_size)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.pos_embed = nn.Parameter(
            torch.zeros(1, n_patches + 1, config.hidden_size))
        self.embed_norm = nn.LayerNorm(config.hidden_size)
        self.embed_drop = nn.Dropout(config.hidden_dropout_prob)

        # Individual layers for geometric injection between each
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=config.hidden_size,
                nhead=config.num_attention_heads,
                dim_feedforward=config.intermediate_size,
                dropout=config.hidden_dropout_prob,
                activation="gelu",
                batch_first=True,
                norm_first=True)
            for _ in range(config.num_hidden_layers)])

        # Geometric injection: pool → anchor_dim → triangulate → hidden_size
        self.geo_pool_proj = nn.Linear(config.hidden_size, config.output_dim)
        self.geo_tri_proj = nn.Sequential(
            nn.Linear(config.n_anchors, config.hidden_size), nn.GELU(),
            nn.LayerNorm(config.hidden_size))

        self.output_proj = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.LayerNorm(config.hidden_size),
            nn.Linear(config.hidden_size, config.output_dim),
        )

        # ── Soup Pipeline (optional) ──
        if getattr(config, "soup_enabled", False):
            self.constellation = Constellation(config.n_anchors, config.output_dim)
            self.patchwork = Patchwork(
                config.n_anchors, config.n_comp, config.d_comp)
            pw_dim = config.n_comp * config.d_comp
            self.classifier = nn.Sequential(
                nn.Linear(pw_dim + config.output_dim, pw_dim),
                nn.GELU(), nn.LayerNorm(pw_dim), nn.Dropout(0.0),
                nn.Linear(pw_dim, config.n_classes))
        else:
            self.constellation = None
            self.patchwork = None
            self.classifier = None

        self.post_init()

    def _init_weights(self, module):
        """Xavier-uniform weights and zero biases for Linear/Conv2d; identity LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            nn.init.ones_(module.weight)
            nn.init.zeros_(module.bias)

    def _embed_patches(self, pixel_values):
        """Patchify the image, prepend the CLS token, add positions, normalize, drop."""
        batch = pixel_values.shape[0]
        x = self.patch_embed(pixel_values)
        # Flatten the spatial grid into a token axis: (B, n_patches, hidden).
        x = x.flatten(2).transpose(1, 2)
        cls = self.cls_token.expand(batch, -1, -1)
        x = torch.cat([cls, x], dim=1)
        x = x + self.pos_embed
        return self.embed_drop(self.embed_norm(x))

    def _geo_token(self, x, anchors_n):
        """Build the geo token: pool → project → triangulate → hidden_size.

        Index 0 of ``x`` (the CLS token) is excluded from the mean pool.
        """
        pooled = x[:, 1:, :].mean(dim=1)
        geo = F.normalize(self.geo_pool_proj(pooled), dim=-1)
        tri_dists = 1.0 - geo @ anchors_n.T
        return self.geo_tri_proj(tri_dists).unsqueeze(1)

    def _geometric_diagnostics(self, embedding, anchors_n, nearest):
        """Summarize embedding/anchor geometry as Python scalars (no gradients)."""
        with torch.no_grad():
            cos_to_anchors = embedding @ anchors_n.T
            return {
                "nearest_cos": cos_to_anchors.max(dim=-1).values.mean().item(),
                "mean_anchor_cos": cos_to_anchors.mean().item(),
                "n_active_anchors": nearest.unique().numel(),
                "embedding_norm": embedding.norm(dim=-1).mean().item(),
            }

    def forward(self, pixel_values, output_patch_tokens=False,
                output_diagnostics=True, **kwargs):
        """Encode images into hypersphere embeddings, plus soup outputs if enabled.

        Args:
            pixel_values: Image batch with 3 channels on dimension 1; the
                resulting patch count must match the positional embedding.
            output_patch_tokens: If true, return the pre-pooling patch tokens.
            output_diagnostics: If true (default), compute the diagnostics
                dict. Each metric is converted to a Python scalar, which
                copies a value to the host; pass ``False`` to skip that work
                and get an empty dict.
            **kwargs: Accepted and ignored.

        Returns:
            GeoLIPViTOutput. ``logits``, ``triangulation`` and ``nearest``
            are ``None`` and ``diagnostics`` is empty when the soup pipeline
            is disabled.
        """
        # ── Encode ──
        x = self._embed_patches(pixel_values)

        # ── Transformer with geometric injection ──
        # Anchors are detached here, so the injection path does not send
        # gradients into the constellation.
        if self.constellation is not None:
            anchors_n = F.normalize(self.constellation.anchors.detach(), dim=-1)
        else:
            anchors_n = None

        for layer in self.layers:
            if anchors_n is None:
                x = layer(x)
                continue
            # Prepend the geo token for this layer only, then strip it so the
            # sequence is [CLS, patches...] again.
            geo_token = self._geo_token(x, anchors_n)
            x = layer(torch.cat([geo_token, x], dim=1))[:, 1:, :]

        # ── Pool + Project ──
        patch_tokens = x[:, 1:, :]
        pooled = patch_tokens.mean(dim=1)
        embedding = F.normalize(self.output_proj(pooled), dim=-1)

        # ── Soup Pipeline ──
        logits = None
        triangulation = None
        nearest = None
        diagnostics = {}

        if self.constellation is not None:
            triangulation, nearest = self.constellation.triangulate(embedding)

            if self.patchwork is not None and self.classifier is not None:
                pw = self.patchwork(triangulation)
                logits = self.classifier(torch.cat([pw, embedding], -1))

            if output_diagnostics:
                # Reuses the normalized anchors computed above.
                diagnostics = self._geometric_diagnostics(
                    embedding, anchors_n, nearest)

        return GeoLIPViTOutput(
            embedding=embedding,
            logits=logits,
            triangulation=triangulation,
            nearest=nearest,
            patch_tokens=patch_tokens if output_patch_tokens else None,
            diagnostics=diagnostics,
        )