"""
================================================================================
SENTINEL NEURAL ARCHITECTURE SEARCH
================================================================================

Theory: The Sentinel prior P(n) ∝ zⁿ/nⁿ penalizes complexity
super-exponentially. This makes it ideal for NAS: it prefers shallow,
efficient architectures.

Key Innovation: Use Sentinel prior as the architecture sampling
distribution. Deeper/more complex architectures are penalized
super-exponentially, guiding search toward efficient designs.
"""

import numpy as np
import torch
import torch.nn as nn
from typing import List, Dict, Tuple, Optional

class SentinelNAS:
    """
    Neural Architecture Search with Sentinel prior.

    Architecture score (as implemented by ``compute_sentinel_score``):
        score(arch) = performance(arch) · P(depth) · mean_l P(width_l) · P(bits)

    where P(n) ∝ zⁿ/nⁿ is the Sentinel prior and ``bits`` is the bit length
    of the parameter count, i.e. floor(log2(params)) + 1.

    This penalizes:
    - Deep architectures: P(depth) is read from the normalized depth PMF
    - Wide architectures: P(width) is read from the normalized width PMF and
      averaged over the layers
    - Large parameter counts: unnormalized z^{bits} / bits^{bits}

    All zⁿ/nⁿ terms are evaluated in log-space as exp(n·(log z − log n)) so
    that a large ``z`` or a large ``n`` cannot overflow an intermediate power.
    """

    # Upper bound applied to every sampled layer width ("cap for practicality").
    WIDTH_CAP = 128

    def __init__(self, z: float = 1.0, max_depth: int = 10,
                 max_width: int = 512, n_samples: int = 100):
        """
        Args:
            z: Scale of the prior; must be a finite, strictly positive number.
            max_depth: Largest depth that can be sampled (>= 1).
            max_width: Largest width covered by the width PMF (>= 1).
            n_samples: Stored on the instance; not read by any method of
                this class.

        Raises:
            ValueError: If ``z`` is not finite and positive, or if
                ``max_depth`` / ``max_width`` is smaller than 1. Without this
                check the PMFs would be NaN or empty and sampling would fail
                later with a less helpful error.
        """
        if not (np.isfinite(z) and z > 0):
            raise ValueError(f"z must be a finite positive number, got {z!r}")
        if max_depth < 1 or max_width < 1:
            raise ValueError(
                f"max_depth and max_width must be >= 1, "
                f"got max_depth={max_depth!r}, max_width={max_width!r}")

        self.z = z
        self.max_depth = max_depth
        self.max_width = max_width
        self.n_samples = n_samples

        # Precompute Sentinel PMF
        self.depth_pmf = self._compute_sentinel_pmf(max_depth)
        self.width_pmf = self._compute_sentinel_pmf(max_width)

    def _log_weight(self, n):
        """Return log(zⁿ/nⁿ) = n·(log z − log n) for n >= 1 (scalar or array)."""
        n = np.asarray(n, dtype=np.float64)
        return n * (np.log(self.z) - np.log(n))

    def _compute_sentinel_pmf(self, max_n: int) -> np.ndarray:
        """
        Compute P(n) ∝ zⁿ/nⁿ for n = 1, ..., max_n.

        Returns:
            Array of length ``max_n`` summing to 1; entry ``i`` is P(i + 1).
        """
        log_w = self._log_weight(np.arange(1, max_n + 1))
        # Shift by the maximum before exponentiating: the largest weight
        # becomes exactly 1, so the sum can neither overflow nor be all-zero.
        weights = np.exp(log_w - log_w.max())
        return weights / weights.sum()

    def sample_architecture(self) -> Dict:
        """
        Sample an architecture from the Sentinel prior.

        Returns:
            Dict with keys ``'depth'`` (int), ``'widths'`` (list of ``depth``
            ints, each capped at ``WIDTH_CAP``), ``'activation'`` and
            ``'skip_connections'`` (bool, True when depth > 3). All values
            are plain Python types rather than NumPy scalars.
        """
        # choice(k, p=...) draws from 0..k-1, hence the +1 shift to 1..k.
        depth = int(np.random.choice(self.max_depth, p=self.depth_pmf)) + 1

        # One width per layer, drawn in a single call.
        raw_widths = np.random.choice(self.max_width, size=depth,
                                      p=self.width_pmf) + 1
        widths = [min(int(w), self.WIDTH_CAP) for w in raw_widths]

        return {
            'depth': depth,
            'widths': widths,
            'activation': 'sentinel',
            'skip_connections': depth > 3  # Add skips for deep networks
        }

    def compute_sentinel_score(self, arch: Dict,
                                performance: float,
                                params: int) -> float:
        """
        Compute architecture score with Sentinel prior.

        score = performance · P(depth) · mean(P(width_l)) · P(bits(params))

        Args:
            arch: Config with an integer ``'depth'`` and a non-empty
                ``'widths'`` sequence of integers.
            performance: Task performance of the architecture.
            params: Parameter count; values below 1 are treated as 1.

        Returns:
            The score as a Python float (higher is better).

        Raises:
            ValueError: If ``arch['widths']`` is empty, because the mean
                width penalty is undefined.
        """
        widths = arch['widths']
        if len(widths) == 0:
            raise ValueError("arch['widths'] must contain at least one layer")

        # Depth penalty. Clamp on both sides: values above the table reuse the
        # last entry, and values below 1 reuse the first one instead of
        # wrapping around to the end of the array via a negative index.
        depth_idx = min(max(int(arch['depth']) - 1, 0), self.max_depth - 1)
        depth_penalty = self.depth_pmf[depth_idx]

        # Width penalty (average across layers), clamped the same way.
        width_idx = np.clip(np.asarray(widths, dtype=np.int64) - 1,
                            0, self.max_width - 1)
        width_penalty = self.width_pmf[width_idx].mean()

        # Parameter penalty (log-scale). For a positive integer,
        # bit_length() == floor(log2(params)) + 1, without float rounding.
        log_params = max(int(params), 1).bit_length()
        param_penalty = self._sentinel_prob(log_params)

        # Combined score (higher is better)
        return float(performance * depth_penalty * width_penalty * param_penalty)

    def _sentinel_prob(self, n: int) -> float:
        """
        Compute the unnormalized P(n) = zⁿ/nⁿ.

        Returns 1.0 for n <= 0. If the true value exceeds the float range the
        result is ``inf``; values too small to represent underflow to 0.0.
        """
        if n <= 0:
            return 1.0
        with np.errstate(over='ignore'):
            return float(np.exp(self._log_weight(n)))

    def search(self, evaluator, n_trials: int = 50) -> Tuple[Optional[Dict], float]:
        """
        Run NAS with Sentinel prior.

        Args:
            evaluator: Function that takes architecture and returns (performance, params)
            n_trials: Number of architectures to evaluate

        Returns:
            best_arch: Best architecture found, or None when no trial produced
                a score greater than -inf (e.g. ``n_trials`` is 0 or every
                score is NaN)
            best_score: Best score (-inf when ``best_arch`` is None)
        """
        best_arch = None
        best_score = -float('inf')

        print(f"\n--- Sentinel NAS Search ---")
        print(f"  Trials: {n_trials}")
        print(f"  Max depth: {self.max_depth}")
        print(f"  Max width: {self.max_width}")
        print(f"  Prior: P(n) ∝ {self.z}ⁿ/nⁿ")

        for trial in range(n_trials):
            arch = self.sample_architecture()
            performance, params = evaluator(arch)
            score = self.compute_sentinel_score(arch, performance, params)

            if score > best_score:
                best_score = score
                best_arch = arch

            # Log the first ten trials, then every tenth one.
            if trial < 10 or trial % 10 == 0:
                print(f"  Trial {trial+1}: depth={arch['depth']}, "
                      f"perf={performance:.3f}, params={params:,}, "
                      f"score={score:.6f}")

        return best_arch, best_score


def build_architecture(arch: Dict, input_dim: int, output_dim: int) -> nn.Module:
    """
    Turn an architecture config into a sequential PyTorch MLP.

    Every entry of ``arch['widths']`` contributes a Linear layer, an
    activation and a Dropout(0.1); a final Linear layer maps to
    ``output_dim``. The activation is ``SentinelActivationLayer`` when
    ``arch.get('activation') == 'sentinel'`` and ``nn.ReLU`` otherwise.
    No other keys of ``arch`` are read here.
    """
    use_sentinel = arch.get('activation') == 'sentinel'

    modules = []
    fan_in = input_dim
    for hidden in arch['widths']:
        modules.extend([
            nn.Linear(fan_in, hidden),
            SentinelActivationLayer() if use_sentinel else nn.ReLU(),
            nn.Dropout(0.1),
        ])
        # The next layer consumes what this one produced.
        fan_in = hidden

    # Output projection from the last hidden width (or straight from the input).
    modules.append(nn.Linear(fan_in, output_dim))
    return nn.Sequential(*modules)


class SentinelActivationLayer(nn.Module):
    """Element-wise activation f(x) = x / cosh(x / e), i.e. x · sech(x / e)."""

    def __init__(self):
        super().__init__()
        # Fixed input scale 1/e; a plain float, not a learnable parameter.
        self.inv_e = 1.0 / np.e

    def forward(self, x):
        """Apply the activation to tensor ``x`` and return the result."""
        damping = 1.0 / torch.cosh(x * self.inv_e)
        return x * damping


def demo_sentinel_nas():
    """
    Demo Sentinel NAS on synthetic task.

    Runs a 50-trial search against a synthetic evaluator, prints the best
    architecture, then compares 20 uniformly sampled architectures with 20
    architectures sampled from the Sentinel prior. Both groups are scored
    with ``SentinelNAS.compute_sentinel_score`` so the reported means and the
    improvement percentage refer to the same quantity.
    """
    print("=" * 70)
    print("  SENTINEL NEURAL ARCHITECTURE SEARCH")
    print("=" * 70)

    # Evaluator: synthetic performance function
    def evaluator(arch: Dict) -> Tuple[float, int]:
        """
        Evaluate architecture (synthetic).

        Performance lies in [0.5, 1.0] and peaks at depth 4 with every layer
        64 wide; it falls off as depth or width moves away from that optimum.
        The parameter count is approximated as the sum of squared widths.
        """
        depth = arch['depth']
        widths = arch['widths']

        # Synthetic performance (peaks at moderate depth/width)
        optimal_depth = 4
        optimal_width = 64

        depth_score = 1.0 / (1.0 + abs(depth - optimal_depth) ** 2)
        width_score = np.mean([1.0 / (1.0 + abs(w - optimal_width) ** 2 / 100)
                                for w in widths])

        performance = 0.5 + 0.5 * (depth_score + width_score) / 2

        # Parameter count
        params = sum(w * w for w in widths)  # Simplified

        return performance, params

    # Run NAS
    nas = SentinelNAS(z=1.0, max_depth=8, max_width=128, n_samples=50)
    best_arch, best_score = nas.search(evaluator, n_trials=50)

    print(f"\n--- Best Architecture ---")
    print(f"  Depth: {best_arch['depth']}")
    print(f"  Widths: {best_arch['widths']}")
    print(f"  Skip connections: {best_arch.get('skip_connections', False)}")
    print(f"  Sentinel score: {best_score:.6f}")

    # Compare to random search
    print(f"\n--- Comparison: Random vs Sentinel ---")
    random_scores = []
    sentinel_scores = []

    for _ in range(20):
        # Uniform baseline over the same search space. randint excludes its
        # upper bound, hence the +1 so max depth / max width can be drawn.
        depth_r = int(np.random.randint(1, nas.max_depth + 1))
        arch_random = {'depth': depth_r,
                       # One width per layer so 'widths' agrees with 'depth'.
                       'widths': [int(np.random.randint(16, nas.max_width + 1))
                                  for _ in range(depth_r)],
                       'activation': 'relu'}
        perf_r, params_r = evaluator(arch_random)
        # Score the baseline with the same objective as the Sentinel samples;
        # raw performance and prior-weighted score are not comparable.
        score_r = nas.compute_sentinel_score(arch_random, perf_r, params_r)
        random_scores.append(score_r)

        arch_sentinel = nas.sample_architecture()
        perf_s, params_s = evaluator(arch_sentinel)
        score_s = nas.compute_sentinel_score(arch_sentinel, perf_s, params_s)
        sentinel_scores.append(score_s)

    mean_random = np.mean(random_scores)
    mean_sentinel = np.mean(sentinel_scores)
    print(f"  Random search mean score: {mean_random:.6f}")
    print(f"  Sentinel NAS mean score: {mean_sentinel:.6f}")
    if mean_random > 0:
        print(f"  Improvement: {(mean_sentinel / mean_random - 1) * 100:.1f}%")
    else:
        # A relative improvement is undefined against a zero baseline.
        print(f"  Improvement: n/a (random baseline mean score is 0)")

    print(f"\n  ✓ Super-exponential prior: penalizes complexity aggressively")
    print(f"  ✓ Automatic efficiency: prefers shallow, narrow architectures")
    print(f"  ✓ No manual regularization: prior is built into sampling")
    print(f"  ✓ Theorem-backed: P(n) ∝ zⁿ/nⁿ from partition function")

    print(f"\n{'='*70}")
    print(f"  SENTINEL NAS: SUPER-EXPONENTIAL PRIOR FOR EFFICIENT ARCHITECTURES")
    print(f"{'='*70}")


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    demo_sentinel_nas()